Diffstat (limited to 'libbcache')
-rw-r--r--  libbcache/acl.c | 225
-rw-r--r--  libbcache/acl.h | 56
-rw-r--r--  libbcache/alloc.c | 1913
-rw-r--r--  libbcache/alloc.h | 85
-rw-r--r--  libbcache/alloc_types.h | 102
-rw-r--r--  libbcache/bcache.h | 831
-rw-r--r--  libbcache/bkey.c | 1167
-rw-r--r--  libbcache/bkey.h | 606
-rw-r--r--  libbcache/bkey_methods.c | 127
-rw-r--r--  libbcache/bkey_methods.h | 82
-rw-r--r--  libbcache/blockdev.c | 819
-rw-r--r--  libbcache/blockdev.h | 134
-rw-r--r--  libbcache/blockdev_types.h | 123
-rw-r--r--  libbcache/bset.c | 1846
-rw-r--r--  libbcache/bset.h | 615
-rw-r--r--  libbcache/btree_cache.c | 756
-rw-r--r--  libbcache/btree_cache.h | 71
-rw-r--r--  libbcache/btree_gc.c | 955
-rw-r--r--  libbcache/btree_gc.h | 104
-rw-r--r--  libbcache/btree_io.c | 1738
-rw-r--r--  libbcache/btree_io.h | 73
-rw-r--r--  libbcache/btree_iter.c | 1150
-rw-r--r--  libbcache/btree_iter.h | 282
-rw-r--r--  libbcache/btree_locking.h | 119
-rw-r--r--  libbcache/btree_types.h | 311
-rw-r--r--  libbcache/btree_update.c | 2345
-rw-r--r--  libbcache/btree_update.h | 424
-rw-r--r--  libbcache/buckets.c | 750
-rw-r--r--  libbcache/buckets.h | 267
-rw-r--r--  libbcache/buckets_types.h | 112
-rw-r--r--  libbcache/chardev.c | 407
-rw-r--r--  libbcache/chardev.h | 30
-rw-r--r--  libbcache/checksum.c | 590
-rw-r--r--  libbcache/checksum.h | 133
-rw-r--r--  libbcache/clock.c | 161
-rw-r--r--  libbcache/clock.h | 23
-rw-r--r--  libbcache/clock_types.h | 34
-rw-r--r--  libbcache/closure.c | 210
-rw-r--r--  libbcache/closure.h | 387
-rw-r--r--  libbcache/compress.c | 500
-rw-r--r--  libbcache/compress.h | 15
-rw-r--r--  libbcache/debug.c | 467
-rw-r--r--  libbcache/debug.h | 65
-rw-r--r--  libbcache/dirent.c | 427
-rw-r--r--  libbcache/dirent.h | 36
-rw-r--r--  libbcache/error.c | 140
-rw-r--r--  libbcache/error.h | 240
-rw-r--r--  libbcache/extents.c | 2498
-rw-r--r--  libbcache/extents.h | 587
-rw-r--r--  libbcache/eytzinger.h | 196
-rw-r--r--  libbcache/fifo.h | 123
-rw-r--r--  libbcache/fs-gc.c | 924
-rw-r--r--  libbcache/fs-gc.h | 7
-rw-r--r--  libbcache/fs-io.c | 2496
-rw-r--r--  libbcache/fs-io.h | 96
-rw-r--r--  libbcache/fs.c | 1481
-rw-r--r--  libbcache/fs.h | 65
-rw-r--r--  libbcache/inode.c | 451
-rw-r--r--  libbcache/inode.h | 57
-rw-r--r--  libbcache/io.c | 1435
-rw-r--r--  libbcache/io.h | 90
-rw-r--r--  libbcache/io_types.h | 145
-rw-r--r--  libbcache/journal.c | 2835
-rw-r--r--  libbcache/journal.h | 373
-rw-r--r--  libbcache/journal_types.h | 242
-rw-r--r--  libbcache/keybuf.c | 195
-rw-r--r--  libbcache/keybuf.h | 16
-rw-r--r--  libbcache/keybuf_types.h | 33
-rw-r--r--  libbcache/keylist.c | 55
-rw-r--r--  libbcache/keylist.h | 62
-rw-r--r--  libbcache/keylist_types.h | 15
-rw-r--r--  libbcache/migrate.c | 395
-rw-r--r--  libbcache/migrate.h | 8
-rw-r--r--  libbcache/move.c | 392
-rw-r--r--  libbcache/move.h | 87
-rw-r--r--  libbcache/move_types.h | 4
-rw-r--r--  libbcache/movinggc.c | 297
-rw-r--r--  libbcache/movinggc.h | 30
-rw-r--r--  libbcache/notify.c | 105
-rw-r--r--  libbcache/notify.h | 34
-rw-r--r--  libbcache/opts.c | 241
-rw-r--r--  libbcache/opts.h | 168
-rw-r--r--  libbcache/request.c | 809
-rw-r--r--  libbcache/request.h | 16
-rw-r--r--  libbcache/siphash.c | 172
-rw-r--r--  libbcache/siphash.h | 86
-rw-r--r--  libbcache/six.c | 396
-rw-r--r--  libbcache/six.h | 136
-rw-r--r--  libbcache/stats.c | 219
-rw-r--r--  libbcache/stats.h | 68
-rw-r--r--  libbcache/stats_types.h | 56
-rw-r--r--  libbcache/str_hash.h | 384
-rw-r--r--  libbcache/super-io.c | 820
-rw-r--r--  libbcache/super-io.h | 159
-rw-r--r--  libbcache/super.c | 2047
-rw-r--r--  libbcache/super.h | 136
-rw-r--r--  libbcache/super_types.h | 12
-rw-r--r--  libbcache/sysfs.c | 1336
-rw-r--r--  libbcache/sysfs.h | 103
-rw-r--r--  libbcache/tier.c | 282
-rw-r--r--  libbcache/tier.h | 8
-rw-r--r--  libbcache/trace.c | 11
-rw-r--r--  libbcache/util.c | 418
-rw-r--r--  libbcache/util.h | 755
-rw-r--r--  libbcache/vstructs.h | 62
-rw-r--r--  libbcache/writeback.c | 657
-rw-r--r--  libbcache/writeback.h | 122
-rw-r--r--  libbcache/xattr.c | 365
-rw-r--r--  libbcache/xattr.h | 20
109 files changed, 0 insertions, 47946 deletions
diff --git a/libbcache/acl.c b/libbcache/acl.c
deleted file mode 100644
index 4363c57e..00000000
--- a/libbcache/acl.c
+++ /dev/null
@@ -1,225 +0,0 @@
-#include "bcache.h"
-
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *bch_acl_from_disk(const void *value, size_t size)
-{
- const char *end = (char *)value + size;
- int n, count;
- struct posix_acl *acl;
-
- if (!value)
- return NULL;
- if (size < sizeof(bch_acl_header))
- return ERR_PTR(-EINVAL);
- if (((bch_acl_header *)value)->a_version !=
- cpu_to_le32(BCH_ACL_VERSION))
- return ERR_PTR(-EINVAL);
- value = (char *)value + sizeof(bch_acl_header);
- count = bch_acl_count(size);
- if (count < 0)
- return ERR_PTR(-EINVAL);
- if (count == 0)
- return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
- if (!acl)
- return ERR_PTR(-ENOMEM);
- for (n = 0; n < count; n++) {
- bch_acl_entry *entry =
- (bch_acl_entry *)value;
- if ((char *)value + sizeof(bch_acl_entry_short) > end)
- goto fail;
- acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
- acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- switch (acl->a_entries[n].e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- value = (char *)value +
- sizeof(bch_acl_entry_short);
- break;
-
- case ACL_USER:
- value = (char *)value + sizeof(bch_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_uid =
- make_kuid(&init_user_ns,
- le32_to_cpu(entry->e_id));
- break;
- case ACL_GROUP:
- value = (char *)value + sizeof(bch_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_gid =
- make_kgid(&init_user_ns,
- le32_to_cpu(entry->e_id));
- break;
-
- default:
- goto fail;
- }
- }
- if (value != end)
- goto fail;
- return acl;
-
-fail:
- posix_acl_release(acl);
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static void *bch_acl_to_disk(const struct posix_acl *acl, size_t *size)
-{
- bch_acl_header *ext_acl;
- char *e;
- size_t n;
-
- *size = bch_acl_size(acl->a_count);
- ext_acl = kmalloc(sizeof(bch_acl_header) + acl->a_count *
- sizeof(bch_acl_entry), GFP_KERNEL);
- if (!ext_acl)
- return ERR_PTR(-ENOMEM);
- ext_acl->a_version = cpu_to_le32(BCH_ACL_VERSION);
- e = (char *)ext_acl + sizeof(bch_acl_header);
- for (n = 0; n < acl->a_count; n++) {
- const struct posix_acl_entry *acl_e = &acl->a_entries[n];
- bch_acl_entry *entry = (bch_acl_entry *)e;
-
- entry->e_tag = cpu_to_le16(acl_e->e_tag);
- entry->e_perm = cpu_to_le16(acl_e->e_perm);
- switch (acl_e->e_tag) {
- case ACL_USER:
- entry->e_id = cpu_to_le32(
- from_kuid(&init_user_ns, acl_e->e_uid));
- e += sizeof(bch_acl_entry);
- break;
- case ACL_GROUP:
- entry->e_id = cpu_to_le32(
- from_kgid(&init_user_ns, acl_e->e_gid));
- e += sizeof(bch_acl_entry);
- break;
-
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- e += sizeof(bch_acl_entry_short);
- break;
-
- default:
- goto fail;
- }
- }
- return (char *)ext_acl;
-
-fail:
- kfree(ext_acl);
- return ERR_PTR(-EINVAL);
-}
-
-struct posix_acl *bch_get_acl(struct inode *inode, int type)
-{
- struct bch_fs *c = inode->i_sb->s_fs_info;
- int name_index;
- char *value = NULL;
- struct posix_acl *acl;
- int ret;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
- break;
- default:
- BUG();
- }
- ret = bch_xattr_get(c, inode, "", NULL, 0, name_index);
- if (ret > 0) {
- value = kmalloc(ret, GFP_KERNEL);
- if (!value)
- return ERR_PTR(-ENOMEM);
- ret = bch_xattr_get(c, inode, "", value,
- ret, name_index);
- }
- if (ret > 0)
- acl = bch_acl_from_disk(value, ret);
- else if (ret == -ENODATA || ret == -ENOSYS)
- acl = NULL;
- else
- acl = ERR_PTR(ret);
- kfree(value);
-
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
- return acl;
-}
-
-int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type)
-{
- struct bch_fs *c = inode->i_sb->s_fs_info;
- int name_index;
- void *value = NULL;
- size_t size = 0;
- int ret;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name_index = BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- ret = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (ret < 0)
- return ret;
- else {
- inode->i_ctime = current_fs_time(inode->i_sb);
- mark_inode_dirty(inode);
- if (ret == 0)
- acl = NULL;
- }
- }
- break;
-
- case ACL_TYPE_DEFAULT:
- name_index = BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
- if (!S_ISDIR(inode->i_mode))
- return acl ? -EACCES : 0;
- break;
-
- default:
- return -EINVAL;
- }
-
- if (acl) {
- value = bch_acl_to_disk(acl, &size);
- if (IS_ERR(value))
- return (int)PTR_ERR(value);
- }
-
- ret = bch_xattr_set(c, inode, "", value, size, 0, name_index);
-
- kfree(value);
-
- if (ret == -ERANGE)
- ret = -E2BIG;
-
- if (!ret)
- set_cached_acl(inode, type, acl);
-
- return ret;
-}
diff --git a/libbcache/acl.h b/libbcache/acl.h
deleted file mode 100644
index 079e5689..00000000
--- a/libbcache/acl.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- File: fs/bch/acl.h
-
- (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-*/
-
-#include <linux/posix_acl_xattr.h>
-
-#define BCH_ACL_VERSION 0x0001
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
- __le32 e_id;
-} bch_acl_entry;
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
-} bch_acl_entry_short;
-
-typedef struct {
- __le32 a_version;
-} bch_acl_header;
-
-static inline size_t bch_acl_size(int count)
-{
- if (count <= 4) {
- return sizeof(bch_acl_header) +
- count * sizeof(bch_acl_entry_short);
- } else {
- return sizeof(bch_acl_header) +
- 4 * sizeof(bch_acl_entry_short) +
- (count - 4) * sizeof(bch_acl_entry);
- }
-}
-
-static inline int bch_acl_count(size_t size)
-{
- ssize_t s;
-
- size -= sizeof(bch_acl_header);
- s = size - 4 * sizeof(bch_acl_entry_short);
- if (s < 0) {
- if (size % sizeof(bch_acl_entry_short))
- return -1;
- return size / sizeof(bch_acl_entry_short);
- } else {
- if (s % sizeof(bch_acl_entry))
- return -1;
- return s / sizeof(bch_acl_entry) + 4;
- }
-}
-
-extern struct posix_acl *bch_get_acl(struct inode *, int);
-extern int bch_set_acl(struct inode *, struct posix_acl *, int);
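
Editorial aside, not part of the commit: bch_acl_size() and bch_acl_count() above together define the on-disk ACL record layout. The first four tags (USER_OBJ, GROUP_OBJ, MASK, OTHER) are stored in the short entry form, and any further entries carry an e_id and use the full form. Below is a minimal userspace sketch of that arithmetic; sketch_acl_size() is an invented name, and the byte sizes are read off the struct definitions above (4-byte header, 4-byte short entry, 8-byte full entry).

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define ACL_HDR_SZ	4	/* bch_acl_header: one __le32 */
#define ACL_SHORT_SZ	4	/* bch_acl_entry_short: two __le16s */
#define ACL_FULL_SZ	8	/* bch_acl_entry: two __le16s plus a __le32 id */

static size_t sketch_acl_size(int count)
{
	/* same branch structure as bch_acl_size() above */
	return count <= 4
		? ACL_HDR_SZ + count * ACL_SHORT_SZ
		: ACL_HDR_SZ + 4 * ACL_SHORT_SZ + (count - 4) * ACL_FULL_SZ;
}

int main(void)
{
	/* six entries: 4 + 4*4 + 2*8 == 36 bytes on disk */
	printf("acl_size(6) = %zu\n", sketch_acl_size(6));
	assert(sketch_acl_size(6) == 36);
	return 0;
}

Feeding 36 back through the bch_acl_count() logic above recovers 6, so the two helpers round-trip as expected.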
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
deleted file mode 100644
index 2f892914..00000000
--- a/libbcache/alloc.c
+++ /dev/null
@@ -1,1913 +0,0 @@
-/*
- * Primary bucket allocation code
- *
- * Copyright 2012 Google, Inc.
- *
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- *
- * When we invalidate a bucket, we have to write its new gen to disk and wait
- * for that write to complete before we use it - otherwise after a crash we
- * could have pointers that appeared to be good but pointed to data that had
- * been overwritten.
- *
- * Since the gens and priorities are all stored contiguously on disk, we can
- * batch this up: We fill up the free_inc list with freshly invalidated buckets,
- * call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
- *
- * free_inc isn't the only freelist - if it was, we'd often have to sleep while
- * priorities and gens were being written before we could allocate. c->free is a
- * smaller freelist, and buckets on that list are always ready to be used.
- *
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
- * It's important to ensure that gens don't wrap around - with respect to
- * either the oldest gen in the btree or the gen on disk. This is quite
- * difficult to do in practice, but we explicitly guard against it anyways - if
- * a bucket is in danger of wrapping around we simply skip invalidating it that
- * time around, and we garbage collect or rewrite the priorities sooner than we
- * would have otherwise.
- *
- * bch_bucket_alloc() allocates a single bucket from a specific device.
- *
- * bch_bucket_alloc_set() allocates one or more buckets from different devices
- * in a given filesystem.
- *
- * invalidate_buckets() drives all the processes described above. It's called
- * from bch_bucket_alloc() and a few other places that need to make sure free
- * buckets are ready.
- *
- * invalidate_buckets_(lru|fifo)() find buckets that are available to be
- * invalidated, and then invalidate them and stick them on the free_inc list -
- * in either lru or fifo order.
- */
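/*
 * Editorial sketch, not part of the deleted file: the gen check described
 * above, reduced to its essentials.  A pointer records the bucket's 8-bit
 * gen at allocation time and becomes stale the moment the bucket's gen is
 * bumped, without the pointer itself being touched; this is roughly what
 * the ptr_stale() check used later in this file does.  The sketch_* types
 * and names are invented for illustration.
 */
#include <stdbool.h>
#include <stddef.h>

struct sketch_bucket { unsigned char gen; };
struct sketch_ptr    { size_t bucket; unsigned char gen; };

static inline bool sketch_ptr_stale(const struct sketch_bucket *buckets,
				    struct sketch_ptr ptr)
{
	return buckets[ptr.bucket].gen != ptr.gen;
}

static inline void sketch_invalidate_bucket(struct sketch_bucket *b)
{
	b->gen++;	/* every outstanding pointer into b is now stale */
}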
-
-#include "bcache.h"
-#include "alloc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "clock.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "super-io.h"
-
-#include <linux/blkdev.h>
-#include <linux/kthread.h>
-#include <linux/math64.h>
-#include <linux/random.h>
-#include <linux/rcupdate.h>
-#include <trace/events/bcache.h>
-
-static void __bch_bucket_free(struct bch_dev *, struct bucket *);
-static void bch_recalc_min_prio(struct bch_dev *, int);
-
-/* Allocation groups: */
-
-void bch_dev_group_remove(struct dev_group *grp, struct bch_dev *ca)
-{
- unsigned i;
-
- spin_lock(&grp->lock);
-
- for (i = 0; i < grp->nr; i++)
- if (grp->d[i].dev == ca) {
- grp->nr--;
- memmove(&grp->d[i],
- &grp->d[i + 1],
- (grp->nr- i) * sizeof(grp->d[0]));
- break;
- }
-
- spin_unlock(&grp->lock);
-}
-
-void bch_dev_group_add(struct dev_group *grp, struct bch_dev *ca)
-{
- unsigned i;
-
- spin_lock(&grp->lock);
- for (i = 0; i < grp->nr; i++)
- if (grp->d[i].dev == ca)
- goto out;
-
- BUG_ON(grp->nr>= BCH_SB_MEMBERS_MAX);
-
- grp->d[grp->nr++].dev = ca;
-out:
- spin_unlock(&grp->lock);
-}
-
-/* Ratelimiting/PD controllers */
-
-static void pd_controllers_update(struct work_struct *work)
-{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs,
- pd_controllers_update);
- struct bch_dev *ca;
- unsigned i, iter;
-
- /* All units are in bytes */
- u64 faster_tiers_size = 0;
- u64 faster_tiers_dirty = 0;
-
- u64 fastest_tier_size = 0;
- u64 fastest_tier_free = 0;
- u64 copygc_can_free = 0;
-
- rcu_read_lock();
- for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
- bch_pd_controller_update(&c->tiers[i].pd,
- div_u64(faster_tiers_size *
- c->tiering_percent, 100),
- faster_tiers_dirty,
- -1);
-
- spin_lock(&c->tiers[i].devs.lock);
- group_for_each_dev(ca, &c->tiers[i].devs, iter) {
- struct bch_dev_usage stats = bch_dev_usage_read(ca);
- unsigned bucket_bits = ca->bucket_bits + 9;
-
- u64 size = (ca->mi.nbuckets -
- ca->mi.first_bucket) << bucket_bits;
- u64 dirty = stats.buckets_dirty << bucket_bits;
- u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
- /*
- * Bytes of internal fragmentation, which can be
- * reclaimed by copy GC
- */
- s64 fragmented = ((stats.buckets_dirty +
- stats.buckets_cached) <<
- bucket_bits) -
- ((stats.sectors[S_DIRTY] +
- stats.sectors[S_CACHED] ) << 9);
-
- fragmented = max(0LL, fragmented);
-
- bch_pd_controller_update(&ca->moving_gc_pd,
- free, fragmented, -1);
-
- faster_tiers_size += size;
- faster_tiers_dirty += dirty;
-
- if (!c->fastest_tier ||
- c->fastest_tier == &c->tiers[i]) {
- fastest_tier_size += size;
- fastest_tier_free += free;
- }
-
- copygc_can_free += fragmented;
- }
- spin_unlock(&c->tiers[i].devs.lock);
- }
-
- rcu_read_unlock();
-
- /*
- * Throttle foreground writes if tier 0 is running out of free buckets,
- * and either tiering or copygc can free up space.
- *
- * Target will be small if there isn't any work to do - we don't want to
- * throttle foreground writes if we currently have all the free space
- * we're ever going to have.
- *
- * Otherwise, if there's work to do, try to keep 20% of tier0 available
- * for foreground writes.
- */
- if (c->fastest_tier)
- copygc_can_free = U64_MAX;
-
- bch_pd_controller_update(&c->foreground_write_pd,
- min(copygc_can_free,
- div_u64(fastest_tier_size *
- c->foreground_target_percent,
- 100)),
- fastest_tier_free,
- -1);
-
- schedule_delayed_work(&c->pd_controllers_update,
- c->pd_controllers_update_seconds * HZ);
-}
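/*
 * Editorial worked example, not from the original file, of the internal
 * fragmentation figure computed above; the numbers are invented.  With
 * 128KiB buckets (bucket_bits + 9 == 17) and 100 dirty buckets (none
 * cached) holding 20000 dirty sectors:
 *
 *	(100 << 17) - (20000 << 9) = 13107200 - 10240000 = 2867200 bytes
 *
 * i.e. roughly 2.7MiB of internal fragmentation that copygc could reclaim
 * by compacting the live data into fewer buckets.
 */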
-
-/*
- * Bucket priorities/gens:
- *
- * For each bucket, we store on disk its
- * 8 bit gen
- * 16 bit priority
- *
- * See alloc.c for an explanation of the gen. The priority is used to implement
- * lru (and in the future other) cache replacement policies; for most purposes
- * it's just an opaque integer.
- *
- * The gens and the priorities don't have a whole lot to do with each other, and
- * it's actually the gens that must be written out at specific times - it's no
- * big deal if the priorities don't get written, if we lose them we just reuse
- * buckets in suboptimal order.
- *
- * On disk they're stored in a packed array, and in as many buckets are required
- * to fit them all. The buckets we use to store them form a list; the journal
- * header points to the first bucket, the first bucket points to the second
- * bucket, et cetera.
- *
- * This code is used by the allocation code; periodically (whenever it runs out
- * of buckets to allocate from) the allocation code will invalidate some
- * buckets, but it can't use those buckets until their new gens are safely on
- * disk.
- */
-
-static int prio_io(struct bch_dev *ca, uint64_t bucket, int op)
-{
- bio_init(ca->bio_prio);
- bio_set_op_attrs(ca->bio_prio, op, REQ_SYNC|REQ_META);
-
- ca->bio_prio->bi_max_vecs = bucket_pages(ca);
- ca->bio_prio->bi_io_vec = ca->bio_prio->bi_inline_vecs;
- ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size;
- ca->bio_prio->bi_bdev = ca->disk_sb.bdev;
- ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca);
- bch_bio_map(ca->bio_prio, ca->disk_buckets);
-
- return submit_bio_wait(ca->bio_prio);
-}
-
-static struct nonce prio_nonce(struct prio_set *p)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = p->nonce[0],
- [2] = p->nonce[1],
- [3] = p->nonce[2]^BCH_NONCE_PRIO,
- }};
-}
-
-static int bch_prio_write(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct journal *j = &c->journal;
- struct journal_res res = { 0 };
- bool need_new_journal_entry;
- int i, ret;
-
- if (c->opts.nochanges)
- return 0;
-
- trace_bcache_prio_write_start(ca);
-
- atomic64_add(ca->mi.bucket_size * prio_buckets(ca),
- &ca->meta_sectors_written);
-
- for (i = prio_buckets(ca) - 1; i >= 0; --i) {
- struct bucket *g;
- struct prio_set *p = ca->disk_buckets;
- struct bucket_disk *d = p->data;
- struct bucket_disk *end = d + prios_per_bucket(ca);
- size_t r;
-
- for (r = i * prios_per_bucket(ca);
- r < ca->mi.nbuckets && d < end;
- r++, d++) {
- g = ca->buckets + r;
- d->read_prio = cpu_to_le16(g->read_prio);
- d->write_prio = cpu_to_le16(g->write_prio);
- d->gen = ca->buckets[r].mark.gen;
- }
-
- p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
- p->magic = cpu_to_le64(pset_magic(c));
- get_random_bytes(&p->nonce, sizeof(p->nonce));
-
- spin_lock(&ca->prio_buckets_lock);
- r = bch_bucket_alloc(ca, RESERVE_PRIO);
- BUG_ON(!r);
-
- /*
- * goes here before dropping prio_buckets_lock to guard against
- * it getting gc'd from under us
- */
- ca->prio_buckets[i] = r;
- bch_mark_metadata_bucket(ca, ca->buckets + r,
- BUCKET_PRIOS, false);
- spin_unlock(&ca->prio_buckets_lock);
-
- SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
-
- bch_encrypt(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- p->encrypted_start,
- bucket_bytes(ca) -
- offsetof(struct prio_set, encrypted_start));
-
- p->csum = bch_checksum(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- (void *) p + sizeof(p->csum),
- bucket_bytes(ca) - sizeof(p->csum));
-
- ret = prio_io(ca, r, REQ_OP_WRITE);
- if (bch_dev_fatal_io_err_on(ret, ca,
- "prio write to bucket %zu", r) ||
- bch_meta_write_fault("prio"))
- return ret;
- }
-
- spin_lock(&j->lock);
- j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]);
- j->nr_prio_buckets = max_t(unsigned,
- ca->dev_idx + 1,
- j->nr_prio_buckets);
- spin_unlock(&j->lock);
-
- do {
- unsigned u64s = jset_u64s(0);
-
- if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
- break;
-
- ret = bch_journal_res_get(j, &res, u64s, u64s);
- if (ret)
- return ret;
-
- need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
- ca->dev_idx + 1;
- bch_journal_res_put(j, &res);
-
- ret = bch_journal_flush_seq(j, res.seq);
- if (ret)
- return ret;
- } while (need_new_journal_entry);
-
- /*
- * Don't want the old priorities to get garbage collected until after we
- * finish writing the new ones, and they're journalled
- */
-
- spin_lock(&ca->prio_buckets_lock);
-
- for (i = 0; i < prio_buckets(ca); i++) {
- if (ca->prio_last_buckets[i])
- __bch_bucket_free(ca,
- &ca->buckets[ca->prio_last_buckets[i]]);
-
- ca->prio_last_buckets[i] = ca->prio_buckets[i];
- }
-
- spin_unlock(&ca->prio_buckets_lock);
-
- trace_bcache_prio_write_end(ca);
- return 0;
-}
-
-int bch_prio_read(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct prio_set *p = ca->disk_buckets;
- struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
- struct bucket_mark new;
- struct bch_csum csum;
- unsigned bucket_nr = 0;
- u64 bucket, expect, got;
- size_t b;
- int ret = 0;
-
- spin_lock(&c->journal.lock);
- bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]);
- spin_unlock(&c->journal.lock);
-
- /*
- * If the device hasn't been used yet, there won't be a prio bucket ptr
- */
- if (!bucket)
- return 0;
-
- unfixable_fsck_err_on(bucket < ca->mi.first_bucket ||
- bucket >= ca->mi.nbuckets, c,
- "bad prio bucket %llu", bucket);
-
- for (b = 0; b < ca->mi.nbuckets; b++, d++) {
- if (d == end) {
- ca->prio_last_buckets[bucket_nr] = bucket;
- bucket_nr++;
-
- ret = prio_io(ca, bucket, REQ_OP_READ);
- if (bch_dev_fatal_io_err_on(ret, ca,
- "prio read from bucket %llu",
- bucket) ||
- bch_meta_read_fault("prio"))
- return -EIO;
-
- got = le64_to_cpu(p->magic);
- expect = pset_magic(c);
- unfixable_fsck_err_on(got != expect, c,
- "bad magic (got %llu expect %llu) while reading prios from bucket %llu",
- got, expect, bucket);
-
- unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c,
- "prio bucket with unknown csum type %llu bucket %lluu",
- PSET_CSUM_TYPE(p), bucket);
-
- csum = bch_checksum(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- (void *) p + sizeof(p->csum),
- bucket_bytes(ca) - sizeof(p->csum));
- unfixable_fsck_err_on(bch_crc_cmp(csum, p->csum), c,
- "bad checksum reading prios from bucket %llu",
- bucket);
-
- bch_encrypt(c, PSET_CSUM_TYPE(p),
- prio_nonce(p),
- p->encrypted_start,
- bucket_bytes(ca) -
- offsetof(struct prio_set, encrypted_start));
-
- bucket = le64_to_cpu(p->next_bucket);
- d = p->data;
- }
-
- ca->buckets[b].read_prio = le16_to_cpu(d->read_prio);
- ca->buckets[b].write_prio = le16_to_cpu(d->write_prio);
-
- bucket_cmpxchg(&ca->buckets[b], new, new.gen = d->gen);
- }
-
- mutex_lock(&c->bucket_lock);
- bch_recalc_min_prio(ca, READ);
- bch_recalc_min_prio(ca, WRITE);
- mutex_unlock(&c->bucket_lock);
-
- ret = 0;
-fsck_err:
- return ret;
-}
-
-#define BUCKET_GC_GEN_MAX 96U
-
-/**
- * wait_buckets_available - wait on reclaimable buckets
- *
- * If there aren't enough available buckets to fill up free_inc, wait until
- * there are.
- */
-static int wait_buckets_available(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- int ret = 0;
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- ret = -1;
- break;
- }
-
- if (ca->inc_gen_needs_gc >= fifo_free(&ca->free_inc)) {
- if (c->gc_thread) {
- trace_bcache_gc_cannot_inc_gens(ca->fs);
- atomic_inc(&c->kick_gc);
- wake_up_process(ca->fs->gc_thread);
- }
-
- /*
- * We are going to wait for GC to wake us up, even if
- * bucket counters tell us enough buckets are available,
- * because we are actually waiting for GC to rewrite
- * nodes with stale pointers
- */
- } else if (dev_buckets_available(ca) >=
- fifo_free(&ca->free_inc))
- break;
-
- up_read(&ca->fs->gc_lock);
- schedule();
- try_to_freeze();
- down_read(&ca->fs->gc_lock);
- }
-
- __set_current_state(TASK_RUNNING);
- return ret;
-}
-
-static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket)
-{
- if (expensive_debug_checks(ca->fs)) {
- size_t iter;
- long i;
- unsigned j;
-
- for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
- BUG_ON(ca->prio_buckets[iter] == bucket);
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- BUG_ON(i == bucket);
- fifo_for_each_entry(i, &ca->free_inc, iter)
- BUG_ON(i == bucket);
- }
-}
-
-/* Bucket heap / gen */
-
-void bch_recalc_min_prio(struct bch_dev *ca, int rw)
-{
- struct bch_fs *c = ca->fs;
- struct prio_clock *clock = &c->prio_clock[rw];
- struct bucket *g;
- u16 max_delta = 1;
- unsigned i;
-
- lockdep_assert_held(&c->bucket_lock);
-
- /* Determine min prio for this particular cache */
- for_each_bucket(g, ca)
- max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
-
- ca->min_prio[rw] = clock->hand - max_delta;
-
- /*
- * This may possibly increase the min prio for the whole cache, check
- * that as well.
- */
- max_delta = 1;
-
- for_each_member_device(ca, c, i)
- max_delta = max(max_delta,
- (u16) (clock->hand - ca->min_prio[rw]));
-
- clock->min_prio = clock->hand - max_delta;
-}
-
-static void bch_rescale_prios(struct bch_fs *c, int rw)
-{
- struct prio_clock *clock = &c->prio_clock[rw];
- struct bch_dev *ca;
- struct bucket *g;
- unsigned i;
-
- trace_bcache_rescale_prios(c);
-
- for_each_member_device(ca, c, i) {
- for_each_bucket(g, ca)
- g->prio[rw] = clock->hand -
- (clock->hand - g->prio[rw]) / 2;
-
- bch_recalc_min_prio(ca, rw);
- }
-}
-
-static void bch_inc_clock_hand(struct io_timer *timer)
-{
- struct prio_clock *clock = container_of(timer,
- struct prio_clock, rescale);
- struct bch_fs *c = container_of(clock,
- struct bch_fs, prio_clock[clock->rw]);
- u64 capacity;
-
- mutex_lock(&c->bucket_lock);
-
- clock->hand++;
-
- /* if clock cannot be advanced more, rescale prio */
- if (clock->hand == (u16) (clock->min_prio - 1))
- bch_rescale_prios(c, clock->rw);
-
- mutex_unlock(&c->bucket_lock);
-
- capacity = READ_ONCE(c->capacity);
-
- if (!capacity)
- return;
-
- /*
- * we only increment when 0.1% of the filesystem capacity has been read
- * or written to; this determines if it's time to advance the clock hand
- *
- * XXX: we shouldn't really be going off of the capacity of devices in
- * RW mode (that will be 0 when we're RO, yet we can still service
- * reads)
- */
- timer->expire += capacity >> 10;
-
- bch_io_timer_add(&c->io_clock[clock->rw], timer);
-}
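/*
 * Editorial note, not part of the original file: c->capacity is tracked in
 * 512-byte sectors (see bch_recalc_capacity() below), so "capacity >> 10"
 * rearms the timer after roughly 0.1% of the filesystem has been read or
 * written.  For a 1TiB filesystem (2^31 sectors) that is 2^21 sectors, so
 * the prio clock hand advances about once per 1GiB of I/O.
 */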
-
-static void bch_prio_timer_init(struct bch_fs *c, int rw)
-{
- struct prio_clock *clock = &c->prio_clock[rw];
- struct io_timer *timer = &clock->rescale;
-
- clock->rw = rw;
- timer->fn = bch_inc_clock_hand;
- timer->expire = c->capacity >> 10;
-}
-
-/*
- * Background allocation thread: scans for buckets to be invalidated,
- * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
- * then optionally issues discard commands to the newly free buckets, then puts
- * them on the various freelists.
- */
-
-static inline bool can_inc_bucket_gen(struct bch_dev *ca, struct bucket *g)
-{
- return bucket_gc_gen(ca, g) < BUCKET_GC_GEN_MAX;
-}
-
-static bool bch_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
-{
- if (!is_available_bucket(READ_ONCE(g->mark)))
- return false;
-
- if (bucket_gc_gen(ca, g) >= BUCKET_GC_GEN_MAX - 1)
- ca->inc_gen_needs_gc++;
-
- return can_inc_bucket_gen(ca, g);
-}
-
-static void bch_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g)
-{
- spin_lock(&ca->freelist_lock);
-
- bch_invalidate_bucket(ca, g);
-
- g->read_prio = ca->fs->prio_clock[READ].hand;
- g->write_prio = ca->fs->prio_clock[WRITE].hand;
-
- verify_not_on_freelist(ca, g - ca->buckets);
- BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
-
- spin_unlock(&ca->freelist_lock);
-}
-
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- * indication of how hot the data is -- we scale the prio so that the prio
- * farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- * indication of the cost in cache misses this eviction will cause.
- *
- * - The difference between the bucket's current gen and oldest gen of any
- * pointer into it, which gives us an indication of the cost of an eventual
- * btree GC to rewrite nodes with stale pointers.
- */
-
-#define bucket_sort_key(g) \
-({ \
- unsigned long prio = g->read_prio - ca->min_prio[READ]; \
- prio = (prio * 7) / (ca->fs->prio_clock[READ].hand - \
- ca->min_prio[READ]); \
- \
- (((prio + 1) * bucket_sectors_used(g)) << 8) | bucket_gc_gen(ca, g);\
-})
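/*
 * Editorial worked example of the key above; this is not code from the
 * original file and the numbers are invented.  With min_prio[READ] == 1000
 * and the clock hand at 1800 (a range of 800), two buckets each holding
 * 128 cached sectors with gc gen 3:
 *
 *	cold bucket, read_prio 1080:  scaled prio = 80*7/800  = 0
 *		key = ((0 + 1) * 128) << 8 | 3 = 32771
 *	hot bucket,  read_prio 1750:  scaled prio = 750*7/800 = 6
 *		key = ((6 + 1) * 128) << 8 | 3 = 229379
 *
 * Smaller keys are reused first, so the colder bucket is invalidated ahead
 * of the hotter one; the scaled prio term only ever multiplies the cost by
 * a factor between 1 and 8.
 */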
-
-static void invalidate_buckets_lru(struct bch_dev *ca)
-{
- struct bucket_heap_entry e;
- struct bucket *g;
- unsigned i;
-
- mutex_lock(&ca->heap_lock);
-
- ca->heap.used = 0;
-
- mutex_lock(&ca->fs->bucket_lock);
- bch_recalc_min_prio(ca, READ);
- bch_recalc_min_prio(ca, WRITE);
-
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for_each_bucket(g, ca) {
- if (!bch_can_invalidate_bucket(ca, g))
- continue;
-
- bucket_heap_push(ca, g, bucket_sort_key(g));
- }
-
- /* Sort buckets by physical location on disk for better locality */
- for (i = 0; i < ca->heap.used; i++) {
- struct bucket_heap_entry *e = &ca->heap.data[i];
-
- e->val = e->g - ca->buckets;
- }
-
- heap_resort(&ca->heap, bucket_max_cmp);
-
- /*
- * If we run out of buckets to invalidate, bch_allocator_thread() will
- * kick stuff and retry us
- */
- while (!fifo_full(&ca->free_inc) &&
- heap_pop(&ca->heap, e, bucket_max_cmp)) {
- BUG_ON(!bch_can_invalidate_bucket(ca, e.g));
- bch_invalidate_one_bucket(ca, e.g);
- }
-
- mutex_unlock(&ca->fs->bucket_lock);
- mutex_unlock(&ca->heap_lock);
-}
-
-static void invalidate_buckets_fifo(struct bch_dev *ca)
-{
- struct bucket *g;
- size_t checked = 0;
-
- while (!fifo_full(&ca->free_inc)) {
- if (ca->fifo_last_bucket < ca->mi.first_bucket ||
- ca->fifo_last_bucket >= ca->mi.nbuckets)
- ca->fifo_last_bucket = ca->mi.first_bucket;
-
- g = ca->buckets + ca->fifo_last_bucket++;
-
- if (bch_can_invalidate_bucket(ca, g))
- bch_invalidate_one_bucket(ca, g);
-
- if (++checked >= ca->mi.nbuckets)
- return;
- }
-}
-
-static void invalidate_buckets_random(struct bch_dev *ca)
-{
- struct bucket *g;
- size_t checked = 0;
-
- while (!fifo_full(&ca->free_inc)) {
- size_t n = bch_rand_range(ca->mi.nbuckets -
- ca->mi.first_bucket) +
- ca->mi.first_bucket;
-
- g = ca->buckets + n;
-
- if (bch_can_invalidate_bucket(ca, g))
- bch_invalidate_one_bucket(ca, g);
-
- if (++checked >= ca->mi.nbuckets / 2)
- return;
- }
-}
-
-static void invalidate_buckets(struct bch_dev *ca)
-{
- ca->inc_gen_needs_gc = 0;
-
- switch (ca->mi.replacement) {
- case CACHE_REPLACEMENT_LRU:
- invalidate_buckets_lru(ca);
- break;
- case CACHE_REPLACEMENT_FIFO:
- invalidate_buckets_fifo(ca);
- break;
- case CACHE_REPLACEMENT_RANDOM:
- invalidate_buckets_random(ca);
- break;
- }
-}
-
-static bool __bch_allocator_push(struct bch_dev *ca, long bucket)
-{
- if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_BTREE], bucket))
- goto success;
-
- if (fifo_push(&ca->free[RESERVE_NONE], bucket))
- goto success;
-
- return false;
-success:
- closure_wake_up(&ca->fs->freelist_wait);
- return true;
-}
-
-static bool bch_allocator_push(struct bch_dev *ca, long bucket)
-{
- bool ret;
-
- spin_lock(&ca->freelist_lock);
- ret = __bch_allocator_push(ca, bucket);
- if (ret)
- fifo_pop(&ca->free_inc, bucket);
- spin_unlock(&ca->freelist_lock);
-
- return ret;
-}
-
-static void bch_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- u16 last_seq_ondisk = c->journal.last_seq_ondisk;
- struct bucket *g;
-
- for_each_bucket(g, ca) {
- struct bucket_mark m = READ_ONCE(g->mark);
-
- if (is_available_bucket(m) &&
- !m.cached_sectors &&
- !m.had_metadata &&
- !bucket_needs_journal_commit(m, last_seq_ondisk)) {
- spin_lock(&ca->freelist_lock);
-
- bch_mark_alloc_bucket(ca, g, true);
- g->read_prio = c->prio_clock[READ].hand;
- g->write_prio = c->prio_clock[WRITE].hand;
-
- verify_not_on_freelist(ca, g - ca->buckets);
- BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets));
-
- spin_unlock(&ca->freelist_lock);
-
- if (fifo_full(&ca->free_inc))
- break;
- }
- }
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by invalidate_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch_allocator_thread(void *arg)
-{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- int ret;
-
- set_freezable();
-
- bch_find_empty_buckets(c, ca);
-
- while (1) {
- /*
- * First, we pull buckets off of the free_inc list, possibly
- * issue discards to them, then we add the bucket to a
- * free list:
- */
-
- while (!fifo_empty(&ca->free_inc)) {
- long bucket = fifo_peek(&ca->free_inc);
-
- /*
- * Don't remove from free_inc until after it's added
- * to freelist, so gc doesn't miss it while we've
- * dropped bucket lock
- */
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (bch_allocator_push(ca, bucket))
- break;
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- goto out;
- }
- schedule();
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- }
-
- down_read(&c->gc_lock);
-
- /*
- * See if we have buckets we can reuse without invalidating them
- * or forcing a journal commit:
- */
- //bch_find_empty_buckets(c, ca);
-
- if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
- up_read(&c->gc_lock);
- continue;
- }
-
- /* We've run out of free buckets! */
-
- while (!fifo_full(&ca->free_inc)) {
- if (wait_buckets_available(ca)) {
- up_read(&c->gc_lock);
- goto out;
- }
-
- /*
- * Find some buckets that we can invalidate, either
- * they're completely unused, or only contain clean data
- * that's been written back to the backing device or
- * another cache tier
- */
-
- invalidate_buckets(ca);
- trace_bcache_alloc_batch(ca, fifo_used(&ca->free_inc),
- ca->free_inc.size);
- }
-
- up_read(&c->gc_lock);
-
- /*
- * free_inc is full of newly-invalidated buckets, must write out
- * prios and gens before they can be re-used
- */
- ret = bch_prio_write(ca);
- if (ret) {
- /*
- * Emergency read only - allocator thread has to
- * shutdown.
- *
- * N.B. we better be going into RO mode, else
- * allocations would hang indefinitely - whatever
- * generated the error will have sent us into RO mode.
- *
- * Clear out the free_inc freelist so things are
- * consistent-ish:
- */
- spin_lock(&ca->freelist_lock);
- while (!fifo_empty(&ca->free_inc)) {
- long bucket;
-
- fifo_pop(&ca->free_inc, bucket);
- bch_mark_free_bucket(ca, ca->buckets + bucket);
- }
- spin_unlock(&ca->freelist_lock);
- goto out;
- }
- }
-out:
- /*
- * Avoid a race with bch_usage_update() trying to wake us up after
- * we've exited:
- */
- synchronize_rcu();
- return 0;
-}
-
-/* Allocation */
-
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-size_t bch_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve)
-{
- struct bucket *g;
- long r;
-
- spin_lock(&ca->freelist_lock);
- if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
- fifo_pop(&ca->free[reserve], r))
- goto out;
-
- spin_unlock(&ca->freelist_lock);
-
- trace_bcache_bucket_alloc_fail(ca, reserve);
- return 0;
-out:
- verify_not_on_freelist(ca, r);
- spin_unlock(&ca->freelist_lock);
-
- trace_bcache_bucket_alloc(ca, reserve);
-
- bch_wake_allocator(ca);
-
- g = ca->buckets + r;
-
- g->read_prio = ca->fs->prio_clock[READ].hand;
- g->write_prio = ca->fs->prio_clock[WRITE].hand;
-
- return r;
-}
-
-static void __bch_bucket_free(struct bch_dev *ca, struct bucket *g)
-{
- bch_mark_free_bucket(ca, g);
-
- g->read_prio = ca->fs->prio_clock[READ].hand;
- g->write_prio = ca->fs->prio_clock[WRITE].hand;
-}
-
-enum bucket_alloc_ret {
- ALLOC_SUCCESS,
- NO_DEVICES, /* -EROFS */
- FREELIST_EMPTY, /* Allocator thread not keeping up */
-};
-
-static void recalc_alloc_group_weights(struct bch_fs *c,
- struct dev_group *devs)
-{
- struct bch_dev *ca;
- u64 available_buckets = 1; /* avoid a divide by zero... */
- unsigned i;
-
- for (i = 0; i < devs->nr; i++) {
- ca = devs->d[i].dev;
-
- devs->d[i].weight = dev_buckets_free(ca);
- available_buckets += devs->d[i].weight;
- }
-
- for (i = 0; i < devs->nr; i++) {
- const unsigned min_weight = U32_MAX >> 4;
- const unsigned max_weight = U32_MAX;
-
- devs->d[i].weight =
- min_weight +
- div64_u64(devs->d[i].weight *
- devs->nr *
- (max_weight - min_weight),
- available_buckets);
- devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
- }
-}
-
-static enum bucket_alloc_ret bch_bucket_alloc_group(struct bch_fs *c,
- struct open_bucket *ob,
- enum alloc_reserve reserve,
- unsigned nr_replicas,
- struct dev_group *devs,
- long *devs_used)
-{
- enum bucket_alloc_ret ret;
- unsigned fail_idx = -1, i;
- unsigned available = 0;
-
- BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs));
-
- if (ob->nr_ptrs >= nr_replicas)
- return ALLOC_SUCCESS;
-
- spin_lock(&devs->lock);
-
- for (i = 0; i < devs->nr; i++)
- available += !test_bit(devs->d[i].dev->dev_idx,
- devs_used);
-
- recalc_alloc_group_weights(c, devs);
-
- i = devs->cur_device;
-
- while (ob->nr_ptrs < nr_replicas) {
- struct bch_dev *ca;
- u64 bucket;
-
- if (!available) {
- ret = NO_DEVICES;
- goto err;
- }
-
- i++;
- i %= devs->nr;
-
- ret = FREELIST_EMPTY;
- if (i == fail_idx)
- goto err;
-
- ca = devs->d[i].dev;
-
- if (test_bit(ca->dev_idx, devs_used))
- continue;
-
- if (fail_idx == -1 &&
- get_random_int() > devs->d[i].weight)
- continue;
-
- bucket = bch_bucket_alloc(ca, reserve);
- if (!bucket) {
- if (fail_idx == -1)
- fail_idx = i;
- continue;
- }
-
- /*
- * open_bucket_add_buckets expects new pointers at the head of
- * the list:
- */
- memmove(&ob->ptrs[1],
- &ob->ptrs[0],
- ob->nr_ptrs * sizeof(ob->ptrs[0]));
- memmove(&ob->ptr_offset[1],
- &ob->ptr_offset[0],
- ob->nr_ptrs * sizeof(ob->ptr_offset[0]));
- ob->nr_ptrs++;
- ob->ptrs[0] = (struct bch_extent_ptr) {
- .gen = ca->buckets[bucket].mark.gen,
- .offset = bucket_to_sector(ca, bucket),
- .dev = ca->dev_idx,
- };
- ob->ptr_offset[0] = 0;
-
- __set_bit(ca->dev_idx, devs_used);
- available--;
- devs->cur_device = i;
- }
-
- ret = ALLOC_SUCCESS;
-err:
- EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC);
- spin_unlock(&devs->lock);
- return ret;
-}
-
-static enum bucket_alloc_ret __bch_bucket_alloc_set(struct bch_fs *c,
- struct write_point *wp,
- struct open_bucket *ob,
- unsigned nr_replicas,
- enum alloc_reserve reserve,
- long *devs_used)
-{
- struct bch_tier *tier;
- /*
- * this should implement policy - for a given type of allocation, decide
- * which devices to allocate from:
- *
- * XXX: switch off wp->type and do something more intelligent here
- */
- if (wp->group)
- return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- wp->group, devs_used);
-
- /* foreground writes: prefer fastest tier: */
- tier = READ_ONCE(c->fastest_tier);
- if (tier)
- bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- &tier->devs, devs_used);
-
- return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
- &c->all_devs, devs_used);
-}
-
-static int bch_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
- struct open_bucket *ob, unsigned nr_replicas,
- enum alloc_reserve reserve, long *devs_used,
- struct closure *cl)
-{
- bool waiting = false;
-
- while (1) {
- switch (__bch_bucket_alloc_set(c, wp, ob, nr_replicas,
- reserve, devs_used)) {
- case ALLOC_SUCCESS:
- if (waiting)
- closure_wake_up(&c->freelist_wait);
-
- return 0;
-
- case NO_DEVICES:
- if (waiting)
- closure_wake_up(&c->freelist_wait);
- return -EROFS;
-
- case FREELIST_EMPTY:
- if (!cl || waiting)
- trace_bcache_freelist_empty_fail(c,
- reserve, cl);
-
- if (!cl)
- return -ENOSPC;
-
- if (waiting)
- return -EAGAIN;
-
- /* Retry allocation after adding ourself to waitlist: */
- closure_wait(&c->freelist_wait, cl);
- waiting = true;
- break;
- default:
- BUG();
- }
- }
-}
-
-/* Open buckets: */
-
-/*
- * Open buckets represent one or more buckets (on multiple devices) that are
- * currently being allocated from. They serve two purposes:
- *
- * - They track buckets that have been partially allocated, allowing for
- * sub-bucket sized allocations - they're used by the sector allocator below
- *
- * - They provide a reference to the buckets they own that mark and sweep GC
- * can find, until the new allocation has a pointer to it inserted into the
- * btree
- *
- * When allocating some space with the sector allocator, the allocation comes
- * with a reference to an open bucket - the caller is required to put that
- * reference _after_ doing the index update that makes its allocation reachable.
- */
-
-static void __bch_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
-{
- const struct bch_extent_ptr *ptr;
-
- lockdep_assert_held(&c->open_buckets_lock);
-
- open_bucket_for_each_ptr(ob, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
-
- bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), false);
- }
-
- ob->nr_ptrs = 0;
-
- list_move(&ob->list, &c->open_buckets_free);
- c->open_buckets_nr_free++;
- closure_wake_up(&c->open_buckets_wait);
-}
-
-void bch_open_bucket_put(struct bch_fs *c, struct open_bucket *b)
-{
- if (atomic_dec_and_test(&b->pin)) {
- spin_lock(&c->open_buckets_lock);
- __bch_open_bucket_put(c, b);
- spin_unlock(&c->open_buckets_lock);
- }
-}
-
-static struct open_bucket *bch_open_bucket_get(struct bch_fs *c,
- unsigned nr_reserved,
- struct closure *cl)
-{
- struct open_bucket *ret;
-
- spin_lock(&c->open_buckets_lock);
-
- if (c->open_buckets_nr_free > nr_reserved) {
- BUG_ON(list_empty(&c->open_buckets_free));
- ret = list_first_entry(&c->open_buckets_free,
- struct open_bucket, list);
- list_move(&ret->list, &c->open_buckets_open);
- BUG_ON(ret->nr_ptrs);
-
- atomic_set(&ret->pin, 1); /* XXX */
- ret->has_full_ptrs = false;
-
- c->open_buckets_nr_free--;
- trace_bcache_open_bucket_alloc(c, cl);
- } else {
- trace_bcache_open_bucket_alloc_fail(c, cl);
-
- if (cl) {
- closure_wait(&c->open_buckets_wait, cl);
- ret = ERR_PTR(-EAGAIN);
- } else
- ret = ERR_PTR(-ENOSPC);
- }
-
- spin_unlock(&c->open_buckets_lock);
-
- return ret;
-}
-
-static unsigned ob_ptr_sectors_free(struct bch_fs *c,
- struct open_bucket *ob,
- struct bch_extent_ptr *ptr)
-{
- struct bch_dev *ca = c->devs[ptr->dev];
- unsigned i = ptr - ob->ptrs;
- unsigned bucket_size = ca->mi.bucket_size;
- unsigned used = (ptr->offset & (bucket_size - 1)) +
- ob->ptr_offset[i];
-
- BUG_ON(used > bucket_size);
-
- return bucket_size - used;
-}
-
-static unsigned open_bucket_sectors_free(struct bch_fs *c,
- struct open_bucket *ob,
- unsigned nr_replicas)
-{
- unsigned i, sectors_free = UINT_MAX;
-
- for (i = 0; i < min(nr_replicas, ob->nr_ptrs); i++)
- sectors_free = min(sectors_free,
- ob_ptr_sectors_free(c, ob, &ob->ptrs[i]));
-
- return sectors_free != UINT_MAX ? sectors_free : 0;
-}
-
-static void open_bucket_copy_unused_ptrs(struct bch_fs *c,
- struct open_bucket *new,
- struct open_bucket *old)
-{
- unsigned i;
-
- for (i = 0; i < old->nr_ptrs; i++)
- if (ob_ptr_sectors_free(c, old, &old->ptrs[i])) {
- struct bch_extent_ptr tmp = old->ptrs[i];
-
- tmp.offset += old->ptr_offset[i];
- new->ptrs[new->nr_ptrs] = tmp;
- new->ptr_offset[new->nr_ptrs] = 0;
- new->nr_ptrs++;
- }
-}
-
-static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob)
-{
-#ifdef CONFIG_BCACHE_DEBUG
- const struct bch_extent_ptr *ptr;
-
- open_bucket_for_each_ptr(ob, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
-
- BUG_ON(ptr_stale(ca, ptr));
- }
-#endif
-}
-
-/* Sector allocator */
-
-static struct open_bucket *lock_writepoint(struct bch_fs *c,
- struct write_point *wp)
-{
- struct open_bucket *ob;
-
- while ((ob = ACCESS_ONCE(wp->b))) {
- mutex_lock(&ob->lock);
- if (wp->b == ob)
- break;
-
- mutex_unlock(&ob->lock);
- }
-
- return ob;
-}
-
-static int open_bucket_add_buckets(struct bch_fs *c,
- struct write_point *wp,
- struct open_bucket *ob,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- struct closure *cl)
-{
- long devs_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
- unsigned i;
- int ret;
-
- /*
- * We might be allocating pointers to add to an existing extent
- * (tiering/copygc/migration) - if so, some of the pointers in our
- * existing open bucket might duplicate devices we already have. This is
- * moderately annoying.
- */
-
- /* Short circuit all the fun stuff if possible: */
- if (ob->nr_ptrs >= nr_replicas)
- return 0;
-
- memset(devs_used, 0, sizeof(devs_used));
-
- for (i = 0; i < ob->nr_ptrs; i++)
- __set_bit(ob->ptrs[i].dev, devs_used);
-
- ret = bch_bucket_alloc_set(c, wp, ob, nr_replicas,
- reserve, devs_used, cl);
-
- if (ret == -EROFS &&
- ob->nr_ptrs >= nr_replicas_required)
- ret = 0;
-
- return ret;
-}
-
-/*
- * Get us an open_bucket we can allocate from, return with it locked:
- */
-struct open_bucket *bch_alloc_sectors_start(struct bch_fs *c,
- struct write_point *wp,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- struct closure *cl)
-{
- struct open_bucket *ob;
- unsigned open_buckets_reserved = wp == &c->btree_write_point
- ? 0 : BTREE_NODE_RESERVE;
- int ret;
-
- BUG_ON(!reserve);
- BUG_ON(!nr_replicas);
-retry:
- ob = lock_writepoint(c, wp);
-
- /*
- * If ob->sectors_free == 0, one or more of the buckets ob points to is
- * full. We can't drop pointers from an open bucket - garbage collection
- * still needs to find them; instead, we must allocate a new open bucket
- * and copy any pointers to non-full buckets into the new open bucket.
- */
- if (!ob || ob->has_full_ptrs) {
- struct open_bucket *new_ob;
-
- new_ob = bch_open_bucket_get(c, open_buckets_reserved, cl);
- if (IS_ERR(new_ob))
- return new_ob;
-
- mutex_lock(&new_ob->lock);
-
- /*
- * We point the write point at the open_bucket before doing the
- * allocation to avoid a race with shutdown:
- */
- if (race_fault() ||
- cmpxchg(&wp->b, ob, new_ob) != ob) {
- /* We raced: */
- mutex_unlock(&new_ob->lock);
- bch_open_bucket_put(c, new_ob);
-
- if (ob)
- mutex_unlock(&ob->lock);
- goto retry;
- }
-
- if (ob) {
- open_bucket_copy_unused_ptrs(c, new_ob, ob);
- mutex_unlock(&ob->lock);
- bch_open_bucket_put(c, ob);
- }
-
- ob = new_ob;
- }
-
- ret = open_bucket_add_buckets(c, wp, ob, nr_replicas,
- nr_replicas_required,
- reserve, cl);
- if (ret) {
- mutex_unlock(&ob->lock);
- return ERR_PTR(ret);
- }
-
- ob->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas);
-
- BUG_ON(!ob->sectors_free);
- verify_not_stale(c, ob);
-
- return ob;
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-void bch_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e,
- unsigned nr_replicas, struct open_bucket *ob,
- unsigned sectors)
-{
- struct bch_extent_ptr tmp;
- bool has_data = false;
- unsigned i;
-
- /*
- * We're keeping any existing pointer k has, and appending new pointers:
- * __bch_write() will only write to the pointers we add here:
- */
-
- BUG_ON(sectors > ob->sectors_free);
-
- /* didn't use all the ptrs: */
- if (nr_replicas < ob->nr_ptrs)
- has_data = true;
-
- for (i = 0; i < min(ob->nr_ptrs, nr_replicas); i++) {
- EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
-
- tmp = ob->ptrs[i];
- tmp.cached = bkey_extent_is_cached(&e->k);
- tmp.offset += ob->ptr_offset[i];
- extent_ptr_append(e, tmp);
-
- ob->ptr_offset[i] += sectors;
-
- this_cpu_add(*c->devs[tmp.dev]->sectors_written, sectors);
- }
-}
-
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
-void bch_alloc_sectors_done(struct bch_fs *c, struct write_point *wp,
- struct open_bucket *ob)
-{
- bool has_data = false;
- unsigned i;
-
- for (i = 0; i < ob->nr_ptrs; i++) {
- if (!ob_ptr_sectors_free(c, ob, &ob->ptrs[i]))
- ob->has_full_ptrs = true;
- else
- has_data = true;
- }
-
- if (likely(has_data))
- atomic_inc(&ob->pin);
- else
- BUG_ON(xchg(&wp->b, NULL) != ob);
-
- mutex_unlock(&ob->lock);
-}
-
-/*
- * Allocates some space in the cache to write to, and k to point to the newly
- * allocated space, and updates k->size and k->offset (to point to the
- * end of the newly allocated space).
- *
- * May allocate fewer sectors than @sectors, k->size indicates how many
- * sectors were actually allocated.
- *
- * Return codes:
- * - -EAGAIN: closure was added to waitlist
- * - -ENOSPC: out of space and no closure provided
- *
- * @c - filesystem.
- * @wp - write point to use for allocating sectors.
- * @k - key to return the allocated space information.
- * @cl - closure to wait for a bucket
- */
-struct open_bucket *bch_alloc_sectors(struct bch_fs *c,
- struct write_point *wp,
- struct bkey_i_extent *e,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- struct closure *cl)
-{
- struct open_bucket *ob;
-
- ob = bch_alloc_sectors_start(c, wp, nr_replicas,
- nr_replicas_required,
- reserve, cl);
- if (IS_ERR_OR_NULL(ob))
- return ob;
-
- if (e->k.size > ob->sectors_free)
- bch_key_resize(&e->k, ob->sectors_free);
-
- bch_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size);
-
- bch_alloc_sectors_done(c, wp, ob);
-
- return ob;
-}
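/*
 * Editorial sketch of the calling convention documented above; this is not
 * code from the original file, sketch_do_write() is an invented caller, and
 * the data write and btree insert steps are only indicated by comments.
 * The point being illustrated is the ordering rule from the open bucket
 * comments earlier in this file: the open_bucket reference is dropped only
 * after the index update has made the allocation reachable, so mark and
 * sweep GC can always find the bucket in the meantime.
 */
static int sketch_do_write(struct bch_fs *c, struct write_point *wp,
			   struct bkey_i_extent *e, struct closure *cl)
{
	struct open_bucket *ob;

	ob = bch_alloc_sectors(c, wp, e, 2, 1, RESERVE_NONE, cl);
	if (IS_ERR_OR_NULL(ob))
		return ob ? PTR_ERR(ob) : -EAGAIN;

	/* ... write e->k.size sectors to the pointers appended to e ... */
	/* ... then insert e into the extents btree ... */

	bch_open_bucket_put(c, ob);	/* only now is it safe to drop the ref */
	return 0;
}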
-
-/* Startup/shutdown (ro/rw): */
-
-void bch_recalc_capacity(struct bch_fs *c)
-{
- struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
- struct bch_dev *ca;
- u64 total_capacity, capacity = 0, reserved_sectors = 0;
- unsigned long ra_pages = 0;
- unsigned i, j;
-
- for_each_online_member(ca, c, i) {
- struct backing_dev_info *bdi =
- blk_get_backing_dev_info(ca->disk_sb.bdev);
-
- ra_pages += bdi->ra_pages;
- }
-
- c->bdi.ra_pages = ra_pages;
-
- /* Find fastest, slowest tiers with devices: */
-
- for (tier = c->tiers;
- tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
- if (!tier->devs.nr)
- continue;
- if (!fastest_tier)
- fastest_tier = tier;
- slowest_tier = tier;
- }
-
- c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
-
- c->promote_write_point.group = &fastest_tier->devs;
-
- if (!fastest_tier)
- goto set_capacity;
-
- /*
- * Capacity of the filesystem is the capacity of all the devices in the
- * slowest (highest) tier - we don't include lower tier devices.
- */
- spin_lock(&slowest_tier->devs.lock);
- group_for_each_dev(ca, &slowest_tier->devs, i) {
- size_t reserve = 0;
-
- /*
- * We need to reserve buckets (from the number
- * of currently available buckets) against
- * foreground writes so that mainly copygc can
- * make forward progress.
- *
- * We need enough to refill the various reserves
- * from scratch - copygc will use its entire
- * reserve all at once, then run again when
- * its reserve is refilled (from the formerly
- * available buckets).
- *
- * This reserve is just used when considering if
- * allocations for foreground writes must wait -
- * not -ENOSPC calculations.
- */
- for (j = 0; j < RESERVE_NONE; j++)
- reserve += ca->free[j].size;
-
- reserve += ca->free_inc.size;
-
- reserve += ARRAY_SIZE(c->write_points);
-
- if (ca->mi.tier)
- reserve += 1; /* tiering write point */
- reserve += 1; /* btree write point */
-
- reserved_sectors += reserve << ca->bucket_bits;
-
- capacity += (ca->mi.nbuckets -
- ca->mi.first_bucket) <<
- ca->bucket_bits;
- }
- spin_unlock(&slowest_tier->devs.lock);
-set_capacity:
- total_capacity = capacity;
-
- capacity *= (100 - c->opts.gc_reserve_percent);
- capacity = div64_u64(capacity, 100);
-
- BUG_ON(capacity + reserved_sectors > total_capacity);
-
- c->capacity = capacity;
-
- if (c->capacity) {
- bch_io_timer_add(&c->io_clock[READ],
- &c->prio_clock[READ].rescale);
- bch_io_timer_add(&c->io_clock[WRITE],
- &c->prio_clock[WRITE].rescale);
- } else {
- bch_io_timer_del(&c->io_clock[READ],
- &c->prio_clock[READ].rescale);
- bch_io_timer_del(&c->io_clock[WRITE],
- &c->prio_clock[WRITE].rescale);
- }
-
- /* Wake up in case someone was waiting for buckets */
- closure_wake_up(&c->freelist_wait);
-}
-
-static void bch_stop_write_point(struct bch_dev *ca,
- struct write_point *wp)
-{
- struct bch_fs *c = ca->fs;
- struct open_bucket *ob;
- struct bch_extent_ptr *ptr;
-
- ob = lock_writepoint(c, wp);
- if (!ob)
- return;
-
- for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
- if (ptr->dev == ca->dev_idx)
- goto found;
-
- mutex_unlock(&ob->lock);
- return;
-found:
- BUG_ON(xchg(&wp->b, NULL) != ob);
- mutex_unlock(&ob->lock);
-
- /* Drop writepoint's ref: */
- bch_open_bucket_put(c, ob);
-}
-
-static bool bch_dev_has_open_write_point(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_extent_ptr *ptr;
- struct open_bucket *ob;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++)
- if (atomic_read(&ob->pin)) {
- mutex_lock(&ob->lock);
- for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
- if (ptr->dev == ca->dev_idx) {
- mutex_unlock(&ob->lock);
- return true;
- }
- mutex_unlock(&ob->lock);
- }
-
- return false;
-}
-
-/* device goes ro: */
-void bch_dev_allocator_stop(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
- struct task_struct *p;
- struct closure cl;
- unsigned i;
-
- closure_init_stack(&cl);
-
- /* First, remove device from allocation groups: */
-
- bch_dev_group_remove(tier, ca);
- bch_dev_group_remove(&c->all_devs, ca);
-
- bch_recalc_capacity(c);
-
- /*
- * Stopping the allocator thread comes after removing from allocation
- * groups, else pending allocations will hang:
- */
-
- p = ca->alloc_thread;
- ca->alloc_thread = NULL;
- smp_wmb();
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid a race with bch_usage_update() -
- * the allocator thread itself does a synchronize_rcu() on exit.
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-
- /* Next, close write points that point to this device... */
-
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch_stop_write_point(ca, &c->write_points[i]);
-
- bch_stop_write_point(ca, &ca->copygc_write_point);
- bch_stop_write_point(ca, &c->promote_write_point);
- bch_stop_write_point(ca, &ca->tiering_write_point);
- bch_stop_write_point(ca, &c->migration_write_point);
- bch_stop_write_point(ca, &c->btree_write_point);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- while (c->btree_reserve_cache_nr) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- bch_open_bucket_put(c, a->ob);
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- /* Avoid deadlocks.. */
-
- closure_wake_up(&c->freelist_wait);
- wake_up(&c->journal.wait);
-
- /* Now wait for any in flight writes: */
-
- while (1) {
- closure_wait(&c->open_buckets_wait, &cl);
-
- if (!bch_dev_has_open_write_point(ca)) {
- closure_wake_up(&c->open_buckets_wait);
- break;
- }
-
- closure_sync(&cl);
- }
-}
-
-/*
- * Startup the allocator thread for transition to RW mode:
- */
-int bch_dev_allocator_start(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct dev_group *tier = &c->tiers[ca->mi.tier].devs;
- struct bch_sb_field_journal *journal_buckets;
- bool has_journal;
- struct task_struct *k;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- k = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
- if (IS_ERR(k))
- return 0;
-
- get_task_struct(k);
- ca->alloc_thread = k;
-
- bch_dev_group_add(tier, ca);
- bch_dev_group_add(&c->all_devs, ca);
-
- mutex_lock(&c->sb_lock);
- journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
- has_journal = bch_nr_journal_buckets(journal_buckets) >=
- BCH_JOURNAL_BUCKETS_MIN;
- mutex_unlock(&c->sb_lock);
-
- if (has_journal)
- bch_dev_group_add(&c->journal.devs, ca);
-
- bch_recalc_capacity(c);
-
- /*
- * Don't wake up allocator thread until after adding device to
- * allocator groups - otherwise, alloc thread could get a spurious
- * -EROFS due to prio_write() -> journal_meta() not finding any devices:
- */
- wake_up_process(k);
- return 0;
-}
-
-void bch_fs_allocator_init(struct bch_fs *c)
-{
- unsigned i;
-
- INIT_LIST_HEAD(&c->open_buckets_open);
- INIT_LIST_HEAD(&c->open_buckets_free);
- spin_lock_init(&c->open_buckets_lock);
- bch_prio_timer_init(c, READ);
- bch_prio_timer_init(c, WRITE);
-
-	/* open bucket 0 is a sentinel NULL: */
- mutex_init(&c->open_buckets[0].lock);
- INIT_LIST_HEAD(&c->open_buckets[0].list);
-
- for (i = 1; i < ARRAY_SIZE(c->open_buckets); i++) {
- mutex_init(&c->open_buckets[i].lock);
- c->open_buckets_nr_free++;
- list_add(&c->open_buckets[i].list, &c->open_buckets_free);
- }
-
- spin_lock_init(&c->all_devs.lock);
-
- for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
- spin_lock_init(&c->tiers[i].devs.lock);
-
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- c->write_points[i].throttle = true;
-
- c->pd_controllers_update_seconds = 5;
- INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
-
- spin_lock_init(&c->foreground_write_pd_lock);
- bch_pd_controller_init(&c->foreground_write_pd);
-	/*
-	 * We do not want the write rate to feed back into the computed rate:
-	 * we do not call bch_ratelimit_delay() at all if the write rate
-	 * exceeds 1GB/s, and in that case the PD controller would think we
-	 * are not "keeping up" and would never change the rate.
-	 */
- c->foreground_write_pd.backpressure = 0;
- init_timer(&c->foreground_write_wakeup);
-
- c->foreground_write_wakeup.data = (unsigned long) c;
- c->foreground_write_wakeup.function = bch_wake_delayed_writes;
-}
diff --git a/libbcache/alloc.h b/libbcache/alloc.h
deleted file mode 100644
index f8aa762d..00000000
--- a/libbcache/alloc.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _BCACHE_ALLOC_H
-#define _BCACHE_ALLOC_H
-
-#include "alloc_types.h"
-
-struct bkey;
-struct bucket;
-struct bch_dev;
-struct bch_fs;
-struct dev_group;
-
-static inline size_t prios_per_bucket(const struct bch_dev *ca)
-{
- return (bucket_bytes(ca) - sizeof(struct prio_set)) /
- sizeof(struct bucket_disk);
-}
-
-static inline size_t prio_buckets(const struct bch_dev *ca)
-{
- return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca));
-}
-
-void bch_dev_group_remove(struct dev_group *, struct bch_dev *);
-void bch_dev_group_add(struct dev_group *, struct bch_dev *);
-
-int bch_prio_read(struct bch_dev *);
-
-size_t bch_bucket_alloc(struct bch_dev *, enum alloc_reserve);
-
-void bch_open_bucket_put(struct bch_fs *, struct open_bucket *);
-
-struct open_bucket *bch_alloc_sectors_start(struct bch_fs *,
- struct write_point *,
- unsigned, unsigned,
- enum alloc_reserve,
- struct closure *);
-
-void bch_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *,
- unsigned, struct open_bucket *, unsigned);
-void bch_alloc_sectors_done(struct bch_fs *, struct write_point *,
- struct open_bucket *);
-
-struct open_bucket *bch_alloc_sectors(struct bch_fs *, struct write_point *,
- struct bkey_i_extent *, unsigned, unsigned,
- enum alloc_reserve, struct closure *);
-
-static inline void bch_wake_allocator(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- if ((p = ACCESS_ONCE(ca->alloc_thread)))
- wake_up_process(p);
- rcu_read_unlock();
-}
-
-static inline struct bch_dev *dev_group_next(struct dev_group *devs,
- unsigned *iter)
-{
- struct bch_dev *ret = NULL;
-
- while (*iter < devs->nr &&
- !(ret = rcu_dereference_check(devs->d[*iter].dev,
- lockdep_is_held(&devs->lock))))
- (*iter)++;
-
- return ret;
-}
-
-#define group_for_each_dev(ca, devs, iter) \
- for ((iter) = 0; \
- ((ca) = dev_group_next((devs), &(iter))); \
- (iter)++)
-
-#define open_bucket_for_each_ptr(_ob, _ptr) \
- for ((_ptr) = (_ob)->ptrs; \
- (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \
- (_ptr)++)
-
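
For illustration only — the helper below is invented for this note and is not part of the header — a minimal usage sketch of the iteration macros above, assuming the caller takes devs->lock as expected by the rcu_dereference_check() in dev_group_next():

static inline u64 dev_group_nbuckets_sketch(struct dev_group *devs)
{
	struct bch_dev *ca;
	unsigned i;
	u64 nbuckets = 0;

	spin_lock(&devs->lock);
	group_for_each_dev(ca, devs, i)
		nbuckets += ca->mi.nbuckets;
	spin_unlock(&devs->lock);

	return nbuckets;
}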
-void bch_recalc_capacity(struct bch_fs *);
-void bch_dev_allocator_stop(struct bch_dev *);
-int bch_dev_allocator_start(struct bch_dev *);
-void bch_fs_allocator_init(struct bch_fs *);
-
-#endif /* _BCACHE_ALLOC_H */
diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h
deleted file mode 100644
index 1bf48ef9..00000000
--- a/libbcache/alloc_types.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef _BCACHE_ALLOC_TYPES_H
-#define _BCACHE_ALLOC_TYPES_H
-
-#include <linux/mutex.h>
-
-#include "clock_types.h"
-
-/*
- * There's two of these clocks, one for reads and one for writes:
- *
- * All fields protected by bucket_lock
- */
-struct prio_clock {
- /*
- * "now" in (read/write) IO time - incremented whenever we do X amount
- * of reads or writes.
- *
- * Goes with the bucket read/write prios: when we read or write to a
- * bucket we reset the bucket's prio to the current hand; thus hand -
- * prio = time since bucket was last read/written.
- *
- * The units are some amount (bytes/sectors) of data read/written, and
- * the units can change on the fly if we need to rescale to fit
- * everything in a u16 - your only guarantee is that the units are
- * consistent.
- */
- u16 hand;
- u16 min_prio;
-
- int rw;
-
- struct io_timer rescale;
-};
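
To make the hand/prio arithmetic described in the comment above concrete, here is a hedged one-liner; the helper name is invented and the real computation lives in the bucket invalidation code:

static inline u16 bucket_io_age_sketch(const struct prio_clock *clock,
				       u16 bucket_prio)
{
	/* unsigned 16-bit subtraction handles hand wraparound naturally */
	return (u16) (clock->hand - bucket_prio);
}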
-
-/*
- * There is one reserve for each type of btree, one for prios and gens
- * and one for moving GC
- */
-enum alloc_reserve {
- RESERVE_PRIO,
- RESERVE_BTREE,
- RESERVE_METADATA_LAST = RESERVE_BTREE,
- RESERVE_MOVINGGC,
-
- RESERVE_NONE,
- RESERVE_NR,
-};
-
-static inline bool allocation_is_metadata(enum alloc_reserve id)
-{
- return id <= RESERVE_METADATA_LAST;
-}
-
-struct dev_group {
- spinlock_t lock;
- unsigned nr;
- unsigned cur_device;
- struct {
- u64 weight;
- struct bch_dev *dev;
- } d[BCH_SB_MEMBERS_MAX];
-};
-
-/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
-#define OPEN_BUCKETS_COUNT 256
-
-#define WRITE_POINT_COUNT 16
-
-struct open_bucket {
- struct list_head list;
- struct mutex lock;
- atomic_t pin;
- bool has_full_ptrs;
- /*
- * recalculated every time we allocate from this open_bucket based on
- * how many pointers we're actually going to use:
- */
- unsigned sectors_free;
- unsigned nr_ptrs;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
- unsigned ptr_offset[BCH_REPLICAS_MAX];
-};
-
-struct write_point {
- struct open_bucket *b;
-
- /*
- * Throttle writes to this write point if tier 0 is full?
- */
- bool throttle;
-
- /*
- * If not NULL, cache group for tiering, promotion and moving GC -
- * always allocates a single replica
- */
- struct dev_group *group;
-
- /*
- * Otherwise do a normal replicated bucket allocation that could come
- * from any device in tier 0 (foreground write)
- */
-};
-
-#endif /* _BCACHE_ALLOC_TYPES_H */
diff --git a/libbcache/bcache.h b/libbcache/bcache.h
deleted file mode 100644
index 1d0e998c..00000000
--- a/libbcache/bcache.h
+++ /dev/null
@@ -1,831 +0,0 @@
-#ifndef _BCACHE_H
-#define _BCACHE_H
-
-/*
- * SOME HIGH LEVEL CODE DOCUMENTATION:
- *
- * Bcache mostly works with cache sets, cache devices, and backing devices.
- *
- * Support for multiple cache devices hasn't quite been finished off yet, but
- * it's about 95% plumbed through. A cache set and its cache devices is sort of
- * like a md raid array and its component devices. Most of the code doesn't care
- * about individual cache devices, the main abstraction is the cache set.
- *
- * Multiple cache devices is intended to give us the ability to mirror dirty
- * cached data and metadata, without mirroring clean cached data.
- *
- * Backing devices are different, in that they have a lifetime independent of a
- * cache set. When you register a newly formatted backing device it'll come up
- * in passthrough mode, and then you can attach and detach a backing device from
- * a cache set at runtime - while it's mounted and in use. Detaching implicitly
- * invalidates any cached data for that backing device.
- *
- * A cache set can have multiple (many) backing devices attached to it.
- *
- * There's also flash only volumes - this is the reason for the distinction
- * between struct cached_dev and struct bcache_device. A flash only volume
- * works much like a bcache device that has a backing device, except the
- * "cached" data is always dirty. The end result is that we get thin
- * provisioning with very little additional code.
- *
- * Flash only volumes work but they're not production ready because the moving
- * garbage collector needs more work. More on that later.
- *
- * BUCKETS/ALLOCATION:
- *
- * Bcache is primarily designed for caching, which means that in normal
- * operation all of our available space will be allocated. Thus, we need an
- * efficient way of deleting things from the cache so we can write new things to
- * it.
- *
- * To do this, we first divide the cache device up into buckets. A bucket is the
- * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
- * works efficiently.
- *
- * Each bucket has a 16 bit priority, and an 8 bit generation associated with
- * it. The gens and priorities for all the buckets are stored contiguously and
- * packed on disk (in a linked list of buckets - aside from the superblock, all
- * of bcache's metadata is stored in buckets).
- *
- * The priority is used to implement an LRU. We reset a bucket's priority when
- * we allocate it or on a cache hit, and every so often we decrement the priority
- * of each bucket. It could be used to implement something more sophisticated,
- * if anyone ever gets around to it.
- *
- * The generation is used for invalidating buckets. Each pointer also has an 8
- * bit generation embedded in it; for a pointer to be considered valid, its gen
- * must match the gen of the bucket it points into. Thus, to reuse a bucket all
- * we have to do is increment its gen (and write its new gen to disk; we batch
- * this up).
- *
- * Bcache is entirely COW - we never write twice to a bucket, even buckets that
- * contain metadata (including btree nodes).
- *
- * THE BTREE:
- *
- * Bcache is in large part designed around the btree.
- *
- * At a high level, the btree is just an index of key -> ptr tuples.
- *
- * Keys represent extents, and thus have a size field. Keys also have a variable
- * number of pointers attached to them (potentially zero, which is handy for
- * invalidating the cache).
- *
- * The key itself is an inode:offset pair. The inode number corresponds to a
- * backing device or a flash only volume. The offset is the ending offset of the
- * extent within the inode - not the starting offset; this makes lookups
- * slightly more convenient.
- *
- * Pointers contain the cache device id, the offset on that device, and an 8 bit
- * generation number. More on the gen later.
- *
- * Index lookups are not fully abstracted - cache lookups in particular are
- * still somewhat mixed in with the btree code, but things are headed in that
- * direction.
- *
- * Updates are fairly well abstracted, though. There are two different ways of
- * updating the btree; insert and replace.
- *
- * BTREE_INSERT will just take a list of keys and insert them into the btree -
- * overwriting (possibly only partially) any extents they overlap with. This is
- * used to update the index after a write.
- *
- * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
- * overwriting a key that matches another given key. This is used for inserting
- * data into the cache after a cache miss, and for background writeback, and for
- * the moving garbage collector.
- *
- * There is no "delete" operation; deleting things from the index is
- * accomplished either by invalidating pointers (by incrementing a bucket's
- * gen) or by inserting a key with 0 pointers - which will overwrite anything
- * previously present at that location in the index.
- *
- * This means that there are always stale/invalid keys in the btree. They're
- * filtered out by the code that iterates through a btree node, and removed when
- * a btree node is rewritten.
- *
- * BTREE NODES:
- *
- * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
- * free smaller than a bucket - so, that's how big our btree nodes are.
- *
- * (If buckets are really big we'll only use part of the bucket for a btree node
- * - no less than 1/4th - but a bucket still contains no more than a single
- * btree node. I'd actually like to change this, but for now we rely on the
- * bucket's gen for deleting btree nodes when we rewrite/split a node.)
- *
- * Anyways, btree nodes are big - big enough to be inefficient with a textbook
- * btree implementation.
- *
- * The way this is solved is that btree nodes are internally log structured; we
- * can append new keys to an existing btree node without rewriting it. This
- * means each set of keys we write is sorted, but the node is not.
- *
- * We maintain this log structure in memory - keeping 1Mb of keys sorted would
- * be expensive, and we have to distinguish between the keys we have written and
- * the keys we haven't. So to do a lookup in a btree node, we have to search
- * each sorted set. But we do merge written sets together lazily, so the cost of
- * these extra searches is quite low (normally most of the keys in a btree node
- * will be in one big set, and then there'll be one or two sets that are much
- * smaller).
- *
- * This log structure makes bcache's btree more of a hybrid between a
- * conventional btree and a compacting data structure, with some of the
- * advantages of both.
- *
- * GARBAGE COLLECTION:
- *
- * We can't just invalidate any bucket - it might contain dirty data or
- * metadata. If it once contained dirty data, other writes might overwrite it
- * later, leaving no valid pointers into that bucket in the index.
- *
- * Thus, the primary purpose of garbage collection is to find buckets to reuse.
- * It also counts how much valid data each bucket currently contains, so that
- * allocation can reuse buckets sooner when they've been mostly overwritten.
- *
- * It also does some things that are really internal to the btree
- * implementation. If a btree node contains pointers that are stale by more than
- * some threshold, it rewrites the btree node to avoid the bucket's generation
- * wrapping around. It also merges adjacent btree nodes if they're empty enough.
- *
- * THE JOURNAL:
- *
- * Bcache's journal is not necessary for consistency; we always strictly
- * order metadata writes so that the btree and everything else is consistent on
- * disk in the event of an unclean shutdown, and in fact bcache had writeback
- * caching (with recovery from unclean shutdown) before journalling was
- * implemented.
- *
- * Rather, the journal is purely a performance optimization; we can't complete a
- * write until we've updated the index on disk, otherwise the cache would be
- * inconsistent in the event of an unclean shutdown. This means that without the
- * journal, on random write workloads we constantly have to update all the leaf
- * nodes in the btree, and those writes will be mostly empty (appending at most
- * a few keys each) - highly inefficient in terms of amount of metadata writes,
- * and it puts more strain on the various btree resorting/compacting code.
- *
- * The journal is just a log of keys we've inserted; on startup we just reinsert
- * all the keys in the open journal entries. That means that when we're updating
- * a node in the btree, we can wait until a 4k block of keys fills up before
- * writing them out.
- *
- * For simplicity, we only journal updates to leaf nodes; updates to parent
- * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
- * the complexity to deal with journalling them (in particular, journal replay)
- * - updates to non leaf nodes just happen synchronously (see btree_split()).
- */
-
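
As a hedged illustration of the generation scheme described under BUCKETS/ALLOCATION above: the helper below is invented for this note (the real checks live in the extents/buckets code), but it captures the rule that a pointer is only valid while its gen matches the bucket's gen.

static inline bool ptr_gen_valid_sketch(u8 bucket_gen, u8 ptr_gen)
{
	/*
	 * Reusing a bucket only requires bumping its gen; any pointer carrying
	 * an older gen is then treated as stale and filtered out on lookup.
	 */
	return bucket_gen == ptr_gen;
}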
-#undef pr_fmt
-#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
-
-#include <linux/bug.h>
-#include <linux/bcache.h>
-#include <linux/bio.h>
-#include <linux/kobject.h>
-#include <linux/lglock.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/percpu-refcount.h>
-#include <linux/radix-tree.h>
-#include <linux/rbtree.h>
-#include <linux/rhashtable.h>
-#include <linux/rwsem.h>
-#include <linux/seqlock.h>
-#include <linux/shrinker.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-
-#include "bset.h"
-#include "fifo.h"
-#include "util.h"
-#include "closure.h"
-#include "opts.h"
-
-#include <linux/dynamic_fault.h>
-
-#define bch_fs_init_fault(name) \
- dynamic_fault("bcache:bch_fs_init:" name)
-#define bch_meta_read_fault(name) \
- dynamic_fault("bcache:meta:read:" name)
-#define bch_meta_write_fault(name) \
- dynamic_fault("bcache:meta:write:" name)
-
-#ifndef bch_fmt
-#define bch_fmt(_c, fmt) "bcache (%s): " fmt "\n", ((_c)->name)
-#endif
-
-#define bch_info(c, fmt, ...) \
- printk(KERN_INFO bch_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_notice(c, fmt, ...) \
- printk(KERN_NOTICE bch_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_warn(c, fmt, ...) \
- printk(KERN_WARNING bch_fmt(c, fmt), ##__VA_ARGS__)
-#define bch_err(c, fmt, ...) \
- printk(KERN_ERR bch_fmt(c, fmt), ##__VA_ARGS__)
-
-#define bch_verbose(c, fmt, ...) \
-do { \
- if ((c)->opts.verbose_recovery) \
- bch_info(c, fmt, ##__VA_ARGS__); \
-} while (0)
-
-/* Parameters that are useful for debugging, but should always be compiled in: */
-#define BCH_DEBUG_PARAMS_ALWAYS() \
- BCH_DEBUG_PARAM(key_merging_disabled, \
- "Disables merging of extents") \
- BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
- "Causes mark and sweep to compact and rewrite every " \
- "btree node it traverses") \
- BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
- "Disables rewriting of btree nodes during mark and sweep")\
- BCH_DEBUG_PARAM(btree_gc_coalesce_disabled, \
- "Disables coalescing of btree nodes") \
- BCH_DEBUG_PARAM(btree_shrinker_disabled, \
- "Disables the shrinker callback for the btree node cache")
-
-/* Parameters that should only be compiled in in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG() \
- BCH_DEBUG_PARAM(expensive_debug_checks, \
- "Enables various runtime debugging checks that " \
- "significantly affect performance") \
- BCH_DEBUG_PARAM(debug_check_bkeys, \
- "Run bkey_debugcheck (primarily checking GC/allocation "\
- "information) when iterating over keys") \
- BCH_DEBUG_PARAM(version_stress_test, \
- "Assigns random version numbers to newly written " \
- "extents, to test overlapping extent cases") \
- BCH_DEBUG_PARAM(verify_btree_ondisk, \
- "Reread btree nodes at various points to verify the " \
- "mergesort in the read path against modifications " \
- "done in memory") \
-
-#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
-
-#ifdef CONFIG_BCACHE_DEBUG
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
-#else
-#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
-#endif
-
-/* name, frequency_units, duration_units */
-#define BCH_TIME_STATS() \
- BCH_TIME_STAT(mca_alloc, sec, us) \
- BCH_TIME_STAT(mca_scan, sec, ms) \
- BCH_TIME_STAT(btree_gc, sec, ms) \
- BCH_TIME_STAT(btree_coalesce, sec, ms) \
- BCH_TIME_STAT(btree_split, sec, us) \
- BCH_TIME_STAT(btree_sort, ms, us) \
- BCH_TIME_STAT(btree_read, ms, us) \
- BCH_TIME_STAT(journal_write, us, us) \
- BCH_TIME_STAT(journal_delay, ms, us) \
- BCH_TIME_STAT(journal_blocked, sec, ms) \
- BCH_TIME_STAT(journal_flush_seq, us, us)
-
-#include "alloc_types.h"
-#include "blockdev_types.h"
-#include "buckets_types.h"
-#include "clock_types.h"
-#include "io_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "keybuf_types.h"
-#include "move_types.h"
-#include "stats_types.h"
-#include "super_types.h"
-
-/* 256k, in sectors */
-#define BTREE_NODE_SIZE_MAX 512
-
-/*
- * Number of nodes we might have to allocate in a worst case btree split
- * operation - we split all the way up to the root, then allocate a new root.
- */
-#define btree_reserve_required_nodes(depth) (((depth) + 1) * 2 + 1)
-
-/* Number of nodes btree coalesce will try to coalesce at once */
-#define GC_MERGE_NODES 4U
-
-/* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX \
- (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
-
-/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2)
-
-struct btree;
-struct crypto_blkcipher;
-struct crypto_ahash;
-
-enum gc_phase {
- GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
- GC_PHASE_PENDING_DELETE,
- GC_PHASE_DONE
-};
-
-struct gc_pos {
- enum gc_phase phase;
- struct bpos pos;
- unsigned level;
-};
-
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u8 state;
- u8 tier;
- u8 has_metadata;
- u8 has_data;
- u8 replacement;
- u8 discard;
- u8 valid;
-};
-
-struct bch_dev {
- struct kobject kobj;
- struct percpu_ref ref;
- struct percpu_ref io_ref;
- struct completion stop_complete;
- struct completion offline_complete;
-
- struct bch_fs *fs;
-
- u8 dev_idx;
- /*
- * Cached version of this device's member info from superblock
- * Committed by bch_write_super() -> bch_fs_mi_update()
- */
- struct bch_member_cpu mi;
- uuid_le uuid;
- char name[BDEVNAME_SIZE];
-
- struct bcache_superblock disk_sb;
-
- struct dev_group self;
-
- /* biosets used in cloned bios for replicas and moving_gc */
- struct bio_set replica_set;
-
- struct task_struct *alloc_thread;
-
- struct prio_set *disk_buckets;
-
- /*
- * When allocating new buckets, prio_write() gets first dibs - since we
-	 * may not be able to allocate at all without writing priorities and gens.
- * prio_last_buckets[] contains the last buckets we wrote priorities to
- * (so gc can mark them as metadata).
- */
- u64 *prio_buckets;
- u64 *prio_last_buckets;
- spinlock_t prio_buckets_lock;
- struct bio *bio_prio;
-
- /*
- * free: Buckets that are ready to be used
- *
- * free_inc: Incoming buckets - these are buckets that currently have
- * cached data in them, and we can't reuse them until after we write
- * their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
- */
- DECLARE_FIFO(long, free)[RESERVE_NR];
- DECLARE_FIFO(long, free_inc);
- spinlock_t freelist_lock;
-
- size_t fifo_last_bucket;
-
- /* Allocation stuff: */
-
- /* most out of date gen in the btree */
- u8 *oldest_gens;
- struct bucket *buckets;
- unsigned short bucket_bits; /* ilog2(bucket_size) */
-
- /* last calculated minimum prio */
- u16 min_prio[2];
-
- /*
-	 * Bucket bookkeeping. The first element is updated by GC, the
- * second contains a saved copy of the stats from the beginning
- * of GC.
- */
- struct bch_dev_usage __percpu *usage_percpu;
- struct bch_dev_usage usage_cached;
-
- atomic_long_t saturated_count;
- size_t inc_gen_needs_gc;
-
- struct mutex heap_lock;
- DECLARE_HEAP(struct bucket_heap_entry, heap);
-
- /* Moving GC: */
- struct task_struct *moving_gc_read;
-
- struct bch_pd_controller moving_gc_pd;
-
- /* Tiering: */
- struct write_point tiering_write_point;
-
- struct write_point copygc_write_point;
-
- struct journal_device journal;
-
- struct work_struct io_error_work;
-
- /* The rest of this all shows up in sysfs */
-#define IO_ERROR_SHIFT 20
- atomic_t io_errors;
- atomic_t io_count;
-
- atomic64_t meta_sectors_written;
- atomic64_t btree_sectors_written;
- u64 __percpu *sectors_written;
-};
-
-/*
- * Flag bits for what phase of startup/shutdown the cache set is at, how we're
- * shutting down, etc.:
- *
- * BCH_FS_DETACHING means we're not just shutting down, we're detaching
- * all the backing devices first (their cached data gets invalidated, and they
- * won't automatically reattach).
- */
-enum {
- BCH_FS_INITIAL_GC_DONE,
- BCH_FS_DETACHING,
- BCH_FS_EMERGENCY_RO,
- BCH_FS_WRITE_DISABLE_COMPLETE,
- BCH_FS_GC_STOPPING,
- BCH_FS_GC_FAILURE,
- BCH_FS_BDEV_MOUNTED,
- BCH_FS_ERROR,
- BCH_FS_FSCK_FIXED_ERRORS,
-};
-
-struct btree_debug {
- unsigned id;
- struct dentry *btree;
- struct dentry *btree_format;
- struct dentry *failed;
-};
-
-struct bch_tier {
- unsigned idx;
- struct task_struct *migrate;
- struct bch_pd_controller pd;
-
- struct dev_group devs;
-};
-
-enum bch_fs_state {
- BCH_FS_STARTING = 0,
- BCH_FS_STOPPING,
- BCH_FS_RO,
- BCH_FS_RW,
-};
-
-struct bch_fs {
- struct closure cl;
-
- struct list_head list;
- struct kobject kobj;
- struct kobject internal;
- struct kobject opts_dir;
- struct kobject time_stats;
- unsigned long flags;
-
- int minor;
- struct device *chardev;
- struct super_block *vfs_sb;
- char name[40];
-
- /* ro/rw, add/remove devices: */
- struct mutex state_lock;
- enum bch_fs_state state;
-
- /* Counts outstanding writes, for clean transition to read-only */
- struct percpu_ref writes;
- struct work_struct read_only_work;
-
- struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
-
- struct bch_opts opts;
-
- /* Updated by bch_sb_update():*/
- struct {
- uuid_le uuid;
- uuid_le user_uuid;
-
- u16 block_size;
- u16 btree_node_size;
-
- u8 nr_devices;
- u8 clean;
-
- u8 meta_replicas_have;
- u8 data_replicas_have;
-
- u8 str_hash_type;
- u8 encryption_type;
-
- u64 time_base_lo;
- u32 time_base_hi;
- u32 time_precision;
- } sb;
-
- struct bch_sb *disk_sb;
- unsigned disk_sb_order;
-
- unsigned short block_bits; /* ilog2(block_size) */
-
- struct closure sb_write;
- struct mutex sb_lock;
-
- struct backing_dev_info bdi;
-
- /* BTREE CACHE */
- struct bio_set btree_read_bio;
-
- struct btree_root btree_roots[BTREE_ID_NR];
- struct mutex btree_root_lock;
-
- bool btree_cache_table_init_done;
- struct rhashtable btree_cache_table;
-
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex btree_cache_lock;
- struct list_head btree_cache;
- struct list_head btree_cache_freeable;
- struct list_head btree_cache_freed;
-
- /* Number of elements in btree_cache + btree_cache_freeable lists */
- unsigned btree_cache_used;
- unsigned btree_cache_reserve;
- struct shrinker btree_cache_shrink;
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct closure_waitlist mca_wait;
- struct task_struct *btree_cache_alloc_lock;
-
- mempool_t btree_reserve_pool;
-
- /*
- * Cache of allocated btree nodes - if we allocate a btree node and
- * don't use it, if we free it that space can't be reused until going
- * _all_ the way through the allocator (which exposes us to a livelock
-	 * when allocating a btree reserve fails halfway through) - instead, we
- * can stick them here:
- */
- struct btree_alloc {
- struct open_bucket *ob;
- BKEY_PADDED(k);
- } btree_reserve_cache[BTREE_NODE_RESERVE * 2];
- unsigned btree_reserve_cache_nr;
- struct mutex btree_reserve_cache_lock;
-
- mempool_t btree_interior_update_pool;
- struct list_head btree_interior_update_list;
- struct mutex btree_interior_update_lock;
-
- struct workqueue_struct *wq;
- /* copygc needs its own workqueue for index updates.. */
- struct workqueue_struct *copygc_wq;
-
- /* ALLOCATION */
- struct bch_pd_controller foreground_write_pd;
- struct delayed_work pd_controllers_update;
- unsigned pd_controllers_update_seconds;
- spinlock_t foreground_write_pd_lock;
- struct bch_write_op *write_wait_head;
- struct bch_write_op *write_wait_tail;
-
- struct timer_list foreground_write_wakeup;
-
- /*
- * These contain all r/w devices - i.e. devices we can currently
- * allocate from:
- */
- struct dev_group all_devs;
- struct bch_tier tiers[BCH_TIER_MAX];
- /* NULL if we only have devices in one tier: */
- struct bch_tier *fastest_tier;
-
- u64 capacity; /* sectors */
-
- /*
- * When capacity _decreases_ (due to a disk being removed), we
- * increment capacity_gen - this invalidates outstanding reservations
- * and forces them to be revalidated
- */
- u32 capacity_gen;
-
- atomic64_t sectors_available;
-
- struct bch_fs_usage __percpu *usage_percpu;
- struct bch_fs_usage usage_cached;
- struct lglock usage_lock;
-
- struct mutex bucket_lock;
-
- struct closure_waitlist freelist_wait;
-
- /*
- * When we invalidate buckets, we use both the priority and the amount
- * of good data to determine which buckets to reuse first - to weight
- * those together consistently we keep track of the smallest nonzero
- * priority of any bucket.
- */
- struct prio_clock prio_clock[2];
-
- struct io_clock io_clock[2];
-
- /* SECTOR ALLOCATOR */
- struct list_head open_buckets_open;
- struct list_head open_buckets_free;
- unsigned open_buckets_nr_free;
- struct closure_waitlist open_buckets_wait;
- spinlock_t open_buckets_lock;
- struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
-
- struct write_point btree_write_point;
-
- struct write_point write_points[WRITE_POINT_COUNT];
- struct write_point promote_write_point;
-
- /*
- * This write point is used for migrating data off a device
- * and can point to any other device.
- * We can't use the normal write points because those will
- * gang up n replicas, and for migration we want only one new
- * replica.
- */
- struct write_point migration_write_point;
-
- /* GARBAGE COLLECTION */
- struct task_struct *gc_thread;
- atomic_t kick_gc;
-
- /*
- * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
- * has been marked by GC.
- *
- * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
- *
- * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
- * currently running, and gc marks are currently valid
- *
- * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
- * can read without a lock.
- */
- seqcount_t gc_pos_lock;
- struct gc_pos gc_pos;
-
- /*
- * The allocation code needs gc_mark in struct bucket to be correct, but
- * it's not while a gc is in progress.
- */
- struct rw_semaphore gc_lock;
-
- /* IO PATH */
- struct bio_set bio_read;
- struct bio_set bio_read_split;
- struct bio_set bio_write;
- struct mutex bio_bounce_pages_lock;
- mempool_t bio_bounce_pages;
-
- mempool_t lz4_workspace_pool;
- void *zlib_workspace;
- struct mutex zlib_workspace_lock;
- mempool_t compression_bounce[2];
-
- struct crypto_blkcipher *chacha20;
- struct crypto_shash *poly1305;
-
- atomic64_t key_version;
-
- /* For punting bio submissions to workqueue, io.c */
- struct bio_list bio_submit_list;
- struct work_struct bio_submit_work;
- spinlock_t bio_submit_lock;
-
- struct bio_list read_retry_list;
- struct work_struct read_retry_work;
- spinlock_t read_retry_lock;
-
- /* FILESYSTEM */
- wait_queue_head_t writeback_wait;
- atomic_t writeback_pages;
- unsigned writeback_pages_max;
- atomic_long_t nr_inodes;
-
- /* NOTIFICATIONS */
- struct mutex uevent_lock;
- struct kobj_uevent_env uevent_env;
-
- /* DEBUG JUNK */
- struct dentry *debug;
- struct btree_debug btree_debug[BTREE_ID_NR];
-#ifdef CONFIG_BCACHE_DEBUG
- struct btree *verify_data;
- struct btree_node *verify_ondisk;
- struct mutex verify_lock;
-#endif
-
- u64 unused_inode_hint;
-
- /*
- * A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
- */
- mempool_t fill_iter;
-
- mempool_t btree_bounce_pool;
-
- struct journal journal;
-
- unsigned bucket_journal_seq;
-
- /* CACHING OTHER BLOCK DEVICES */
- mempool_t search;
- struct radix_tree_root devices;
- struct list_head cached_devs;
- u64 cached_dev_sectors;
- struct closure caching;
-
-#define CONGESTED_MAX 1024
- unsigned congested_last_us;
- atomic_t congested;
-
- /* The rest of this all shows up in sysfs */
- unsigned congested_read_threshold_us;
- unsigned congested_write_threshold_us;
-
- struct cache_accounting accounting;
- atomic_long_t cache_read_races;
- atomic_long_t writeback_keys_done;
- atomic_long_t writeback_keys_failed;
-
- unsigned error_limit;
- unsigned error_decay;
-
- unsigned foreground_write_ratelimit_enabled:1;
- unsigned copy_gc_enabled:1;
- unsigned tiering_enabled:1;
- unsigned tiering_percent;
-
- /*
- * foreground writes will be throttled when the number of free
- * buckets is below this percentage
- */
- unsigned foreground_target_percent;
-
-#define BCH_DEBUG_PARAM(name, description) bool name;
- BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- struct time_stats name##_time;
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
-};
-
-static inline bool bch_fs_running(struct bch_fs *c)
-{
- return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
-}
-
-static inline unsigned bucket_pages(const struct bch_dev *ca)
-{
- return ca->mi.bucket_size / PAGE_SECTORS;
-}
-
-static inline unsigned bucket_bytes(const struct bch_dev *ca)
-{
- return ca->mi.bucket_size << 9;
-}
-
-static inline unsigned block_bytes(const struct bch_fs *c)
-{
- return c->sb.block_size << 9;
-}
-
-#endif /* _BCACHE_H */
diff --git a/libbcache/bkey.c b/libbcache/bkey.c
deleted file mode 100644
index 374237e2..00000000
--- a/libbcache/bkey.c
+++ /dev/null
@@ -1,1167 +0,0 @@
-
-#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
-
-#include <linux/kernel.h>
-
-#include "bkey.h"
-#include "bset.h"
-#include "util.h"
-
-const struct bkey_format bch_bkey_format_current = BKEY_FORMAT_CURRENT;
-
-struct bkey __bkey_unpack_key(const struct bkey_format *,
- const struct bkey_packed *);
-
-void bch_to_binary(char *out, const u64 *p, unsigned nr_bits)
-{
- unsigned bit = high_bit_offset, done = 0;
-
- while (1) {
- while (bit < 64) {
- if (done && !(done % 8))
- *out++ = ' ';
- *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
- bit++;
- done++;
- if (done == nr_bits) {
- *out++ = '\0';
- return;
- }
- }
-
- p = next_word(p);
- bit = 0;
- }
-}
-
-#ifdef CONFIG_BCACHE_DEBUG
-
-static void bch_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
-{
- struct bkey tmp;
-
- BUG_ON(bkeyp_val_u64s(format, packed) !=
- bkey_val_u64s(unpacked));
-
- BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
-
- tmp = __bkey_unpack_key(format, packed);
-
- if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- char buf1[160], buf2[160];
- char buf3[160], buf4[160];
-
- bch_bkey_to_text(buf1, sizeof(buf1), unpacked);
- bch_bkey_to_text(buf2, sizeof(buf2), &tmp);
- bch_to_binary(buf3, (void *) unpacked, 80);
- bch_to_binary(buf4, high_word(format, packed), 80);
-
- panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
- format->key_u64s,
- format->bits_per_field[0],
- format->bits_per_field[1],
- format->bits_per_field[2],
- format->bits_per_field[3],
- format->bits_per_field[4],
- buf1, buf2, buf3, buf4);
- }
-}
-
-#else
-static inline void bch_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format) {}
-#endif
-
-int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
- char *out = buf, *end = buf + size;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
- k->u64s, k->type, k->p.inode, k->p.offset,
- k->p.snapshot, k->size, k->version.lo);
-
- BUG_ON(bkey_packed(k));
-
- switch (k->type) {
- case KEY_TYPE_DELETED:
- p(" deleted");
- break;
- case KEY_TYPE_DISCARD:
- p(" discard");
- break;
- case KEY_TYPE_ERROR:
- p(" error");
- break;
- case KEY_TYPE_COOKIE:
- p(" cookie");
- break;
- }
-#undef p
-
- return out - buf;
-}
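
A hedged usage sketch of bch_bkey_to_text(): the wrapper below is illustrative only and not part of the file; the buffer size simply mirrors the 160-byte buffers used by bch_bkey_pack_verify() above.

static void log_key_sketch(struct bch_fs *c, const struct bkey *k)
{
	char buf[160];

	bch_bkey_to_text(buf, sizeof(buf), k);
	bch_info(c, "key %s", buf);
}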
-
-struct pack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct pack_state pack_state_init(const struct bkey_format *format,
- struct bkey_packed *k)
-{
- u64 *p = high_word(format, k);
-
- return (struct pack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = 0,
- .p = p,
- };
-}
-
-__always_inline
-static void pack_state_finish(struct pack_state *state,
- struct bkey_packed *k)
-{
- EBUG_ON(state->p < k->_data);
- EBUG_ON(state->p >= k->_data + state->format->key_u64s);
-
- *state->p = state->w;
-}
-
-struct unpack_state {
- const struct bkey_format *format;
- unsigned bits; /* bits remaining in current word */
- u64 w; /* current word */
- const u64 *p; /* pointer to next word */
-};
-
-__always_inline
-static struct unpack_state unpack_state_init(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(format, k);
-
- return (struct unpack_state) {
- .format = format,
- .bits = 64 - high_bit_offset,
- .w = *p << high_bit_offset,
- .p = p,
- };
-}
-
-__always_inline
-static u64 get_inc_field(struct unpack_state *state, unsigned field)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (bits >= state->bits) {
- v = state->w >> (64 - bits);
- bits -= state->bits;
-
- state->p = next_word(state->p);
- state->w = *state->p;
- state->bits = 64;
- }
-
- /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
- v |= (state->w >> 1) >> (63 - bits);
- state->w <<= bits;
- state->bits -= bits;
-
- return v + offset;
-}
-
-__always_inline
-static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
-
- if (v < offset)
- return false;
-
- v -= offset;
-
- if (fls64(v) > bits)
- return false;
-
- if (bits > state->bits) {
- bits -= state->bits;
- /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
-
- return true;
-}
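
set_inc_field() above refuses to pack a value the format cannot represent. As a hedged restatement of that fit test in isolation (helper name invented, not part of the file):

static inline bool field_fits_sketch(const struct bkey_format *f,
				     unsigned field, u64 v)
{
	u64 offset = le64_to_cpu(f->field_offset[field]);

	/* the value must be >= the field's offset, and the remainder must fit: */
	return v >= offset &&
		fls64(v - offset) <= f->bits_per_field[field];
}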
-
-/*
- * Note: does NOT set out->format (we don't know what it should be here!)
- *
- * Also: doesn't work on extents - it doesn't preserve the invariant that
- * if k is packed bkey_start_pos(k) will successfully pack
- */
-static bool bch_bkey_transform_key(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- struct pack_state out_s = pack_state_init(out_f, out);
- struct unpack_state in_s = unpack_state_init(in_f, in);
- unsigned i;
-
- out->_data[0] = 0;
-
- for (i = 0; i < BKEY_NR_FIELDS; i++)
- if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
- return false;
-
- /* Can't happen because the val would be too big to unpack: */
- EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
-
- pack_state_finish(&out_s, out);
- out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- return true;
-}
-
-bool bch_bkey_transform(const struct bkey_format *out_f,
- struct bkey_packed *out,
- const struct bkey_format *in_f,
- const struct bkey_packed *in)
-{
- if (!bch_bkey_transform_key(out_f, out, in_f, in))
- return false;
-
- memcpy_u64s((u64 *) out + out_f->key_u64s,
- (u64 *) in + in_f->key_u64s,
- (in->u64s - in_f->key_u64s));
- return true;
-}
-
-#define bkey_fields() \
- x(BKEY_FIELD_INODE, p.inode) \
- x(BKEY_FIELD_OFFSET, p.offset) \
- x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
- x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, version.hi) \
- x(BKEY_FIELD_VERSION_LO, version.lo)
-
-struct bkey __bkey_unpack_key(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bkey out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
- EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
-
- out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
- out.format = KEY_FORMAT_CURRENT;
- out.needs_whiteout = in->needs_whiteout;
- out.type = in->type;
- out.pad[0] = 0;
-
-#define x(id, field) out.field = get_inc_field(&state, id);
- bkey_fields()
-#undef x
-
- return out;
-}
-
-#ifndef HAVE_BCACHE_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct unpack_state state = unpack_state_init(format, in);
- struct bpos out;
-
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
- out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
- out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
- out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
-
- return out;
-}
-#endif
-
-/**
- * bkey_pack_key -- pack just the key, not the value
- */
-bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
- const struct bkey_format *format)
-{
- struct pack_state state = pack_state_init(format, out);
-
- EBUG_ON((void *) in == (void *) out);
- EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
- EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
- out->_data[0] = 0;
-
-#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
- bkey_fields()
-#undef x
-
- /*
- * Extents - we have to guarantee that if an extent is packed, a trimmed
- * version will also pack:
- */
-	if (bkey_start_offset(in) <
-	    le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
- return false;
-
- pack_state_finish(&state, out);
- out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->needs_whiteout = in->needs_whiteout;
- out->type = in->type;
-
- bch_bkey_pack_verify(out, in, format);
- return true;
-}
-
-/**
- * bkey_unpack -- unpack the key and the value
- */
-void bkey_unpack(const struct btree *b, struct bkey_i *dst,
- const struct bkey_packed *src)
-{
- dst->k = bkey_unpack_key(b, src);
-
- memcpy_u64s(&dst->v,
- bkeyp_val(&b->format, src),
- bkeyp_val_u64s(&b->format, src));
-}
-
-/**
- * bkey_pack -- pack the key and the value
- */
-bool bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
- const struct bkey_format *format)
-{
- struct bkey_packed tmp;
-
- if (!bkey_pack_key(&tmp, &in->k, format))
- return false;
-
- memmove_u64s((u64 *) out + format->key_u64s,
- &in->v,
- bkey_val_u64s(&in->k));
- memcpy_u64s(out, &tmp, format->key_u64s);
-
- return true;
-}
-
-__always_inline
-static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
-{
- unsigned bits = state->format->bits_per_field[field];
- u64 offset = le64_to_cpu(state->format->field_offset[field]);
- bool ret = true;
-
- EBUG_ON(v < offset);
- v -= offset;
-
- if (fls64(v) > bits) {
- v = ~(~0ULL << bits);
- ret = false;
- }
-
- if (bits > state->bits) {
- bits -= state->bits;
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
-
- return ret;
-}
-
-#ifdef CONFIG_BCACHE_DEBUG
-static bool bkey_packed_successor(struct bkey_packed *out,
- const struct btree *b,
- struct bkey_packed k)
-{
- const struct bkey_format *f = &b->format;
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned first_bit, offset;
- u64 *p;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- if (!nr_key_bits)
- return false;
-
- *out = k;
-
- first_bit = high_bit_offset + nr_key_bits - 1;
- p = nth_word(high_word(f, out), first_bit >> 6);
- offset = 63 - (first_bit & 63);
-
- while (nr_key_bits) {
- unsigned bits = min(64 - offset, nr_key_bits);
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if ((*p & mask) != mask) {
- *p += 1ULL << offset;
- EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
- return true;
- }
-
- *p &= ~mask;
- p = prev_word(p);
- nr_key_bits -= bits;
- offset = 0;
- }
-
- return false;
-}
-#endif
-
-/*
- * Returns a packed key that compares <= in
- *
- * This is used in bset_search_tree(), where we need a packed pos in order to be
- * able to compare against the keys in the auxiliary search tree - and it's
- * legal to use a packed pos that isn't equivalent to the original pos,
- * _provided_ it compares <= to the original pos.
- */
-enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *out,
- struct bpos in,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- struct pack_state state = pack_state_init(f, out);
-#ifdef CONFIG_BCACHE_DEBUG
- struct bpos orig = in;
-#endif
- bool exact = true;
-
- out->_data[0] = 0;
-
- if (unlikely(in.snapshot <
- le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
- if (!in.offset-- &&
- !in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.offset <
- le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
- if (!in.inode--)
- return BKEY_PACK_POS_FAIL;
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (unlikely(in.inode <
- le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
- return BKEY_PACK_POS_FAIL;
-
- if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
- in.offset = KEY_OFFSET_MAX;
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
- in.snapshot = KEY_SNAPSHOT_MAX;
- exact = false;
- }
-
- if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
- exact = false;
-
- pack_state_finish(&state, out);
- out->u64s = f->key_u64s;
- out->format = KEY_FORMAT_LOCAL_BTREE;
- out->type = KEY_TYPE_DELETED;
-
-#ifdef CONFIG_BCACHE_DEBUG
- if (exact) {
- BUG_ON(bkey_cmp_left_packed(b, out, &orig));
- } else {
- struct bkey_packed successor;
-
- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
- BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0);
- }
-#endif
-
- return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
-}
-
-void bch_bkey_format_init(struct bkey_format_state *s)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
- s->field_min[i] = U64_MAX;
-
- for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
- s->field_max[i] = 0;
-
- /* Make sure we can store a size of 0: */
- s->field_min[BKEY_FIELD_SIZE] = 0;
-}
-
-static void __bkey_format_add(struct bkey_format_state *s,
- unsigned field, u64 v)
-{
- s->field_min[field] = min(s->field_min[field], v);
- s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Widens the format state @s so that @k can be packed with the resulting format
- */
-void bch_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
- bkey_fields()
-#undef x
- __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
-}
-
-void bch_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
-{
- unsigned field = 0;
-
- __bkey_format_add(s, field++, p.inode);
- __bkey_format_add(s, field++, p.offset);
- __bkey_format_add(s, field++, p.snapshot);
-}
-
-/*
- * We don't want it to be possible for the packed format to represent fields
- * bigger than a u64... that will cause confusion and issues (like with
- * bkey_packed_successor())
- */
-static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
- unsigned bits, u64 offset)
-{
- offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
-
- f->bits_per_field[i] = bits;
- f->field_offset[i] = cpu_to_le64(offset);
-}
-
-struct bkey_format bch_bkey_format_done(struct bkey_format_state *s)
-{
- unsigned i, bits = KEY_PACKED_BITS_START;
- struct bkey_format ret = {
- .nr_fields = BKEY_NR_FIELDS,
- };
-
- for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
- s->field_min[i] = min(s->field_min[i], s->field_max[i]);
-
- set_format_field(&ret, i,
- fls64(s->field_max[i] - s->field_min[i]),
- s->field_min[i]);
-
- bits += ret.bits_per_field[i];
- }
-
- /* allow for extent merging: */
- if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
- ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
- bits += 4;
- }
-
- ret.key_u64s = DIV_ROUND_UP(bits, 64);
-
- /* if we have enough spare bits, round fields up to nearest byte */
- bits = ret.key_u64s * 64 - bits;
-
- for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
- unsigned r = round_up(ret.bits_per_field[i], 8) -
- ret.bits_per_field[i];
-
- if (r <= bits) {
- set_format_field(&ret, i,
- ret.bits_per_field[i] + r,
- le64_to_cpu(ret.field_offset[i]));
- bits -= r;
- }
- }
-
- EBUG_ON(bch_bkey_format_validate(&ret));
- return ret;
-}
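
A hedged usage sketch of the bkey_format_state API above; the function below is invented for illustration and shows how a packed format covering every key in an array would be computed:

static struct bkey_format format_for_keys_sketch(const struct bkey *keys,
						 unsigned nr)
{
	struct bkey_format_state s;
	unsigned i;

	bch_bkey_format_init(&s);
	for (i = 0; i < nr; i++)
		bch_bkey_format_add_key(&s, &keys[i]);

	return bch_bkey_format_done(&s);
}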
-
-const char *bch_bkey_format_validate(struct bkey_format *f)
-{
- unsigned i, bits = KEY_PACKED_BITS_START;
-
- if (f->nr_fields != BKEY_NR_FIELDS)
- return "invalid format: incorrect number of fields";
-
- for (i = 0; i < f->nr_fields; i++) {
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (f->bits_per_field[i] > 64)
- return "invalid format: field too large";
-
- if (field_offset &&
- (f->bits_per_field[i] == 64 ||
- (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
- field_offset)))
- return "invalid format: offset + bits overflow";
-
- bits += f->bits_per_field[i];
- }
-
- if (f->key_u64s != DIV_ROUND_UP(bits, 64))
- return "invalid format: incorrect key_u64s";
-
- return NULL;
-}
-
-/*
- * Most significant differing bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bkey_greatest_differing_bit(const struct btree *b,
- const struct bkey_packed *l_k,
- const struct bkey_packed *r_k)
-{
- const u64 *l = high_word(&b->format, l_k);
- const u64 *r = high_word(&b->format, r_k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned word_bits = 64 - high_bit_offset;
- u64 l_v, r_v;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- /* for big endian, skip past header */
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (nr_key_bits) {
- if (nr_key_bits < word_bits) {
- l_v >>= word_bits - nr_key_bits;
- r_v >>= word_bits - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= word_bits;
- }
-
- if (l_v != r_v)
- return fls64(l_v ^ r_v) - 1 + nr_key_bits;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- word_bits = 64;
- }
-
- return 0;
-}
-
-/*
- * First set bit
- * Bits are indexed from 0 - return is [0, nr_key_bits)
- */
-__pure
-unsigned bkey_ffs(const struct btree *b,
- const struct bkey_packed *k)
-{
- const u64 *p = high_word(&b->format, k);
- unsigned nr_key_bits = b->nr_key_bits;
- unsigned ret = 0, offset;
-
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
-
- offset = nr_key_bits;
- while (offset > 64) {
- p = next_word(p);
- offset -= 64;
- }
-
- offset = 64 - offset;
-
- while (nr_key_bits) {
- unsigned bits = nr_key_bits + offset < 64
- ? nr_key_bits
- : 64 - offset;
-
- u64 mask = (~0ULL >> (64 - bits)) << offset;
-
- if (*p & mask)
- return ret + __ffs64(*p & mask) - offset;
-
- p = prev_word(p);
- nr_key_bits -= bits;
- ret += bits;
- offset = 0;
- }
-
- return 0;
-}
-
-#ifdef CONFIG_X86_64
-
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- long d0, d1, d2, d3;
- int cmp;
-
-	/* we shouldn't need asm for this, but gcc generates poor code for it: */
-
- asm(".intel_syntax noprefix;"
- "xor eax, eax;"
- "xor edx, edx;"
- "1:;"
- "mov r8, [rdi];"
- "mov r9, [rsi];"
- "sub ecx, 64;"
- "jl 2f;"
-
- "cmp r8, r9;"
- "jnz 3f;"
-
- "lea rdi, [rdi - 8];"
- "lea rsi, [rsi - 8];"
- "jmp 1b;"
-
- "2:;"
- "not ecx;"
- "shr r8, 1;"
- "shr r9, 1;"
- "shr r8, cl;"
- "shr r9, cl;"
- "cmp r8, r9;"
-
- "3:\n"
- "seta al;"
- "setb dl;"
- "sub eax, edx;"
- ".att_syntax prefix;"
- : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
- : "0" (l), "1" (r), "3" (nr_key_bits)
- : "r8", "r9", "cc", "memory");
-
- return cmp;
-}
-
-#define I(_x) (*(out)++ = (_x))
-#define I1(i0) I(i0)
-#define I2(i0, i1) (I1(i0), I(i1))
-#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
-#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
-#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
-
-static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
- enum bch_bkey_fields field,
- unsigned dst_offset, unsigned dst_size,
- bool *eax_zeroed)
-{
- unsigned byte = format->key_u64s * sizeof(u64);
- unsigned bits = format->bits_per_field[field];
- u64 offset = format->field_offset[field];
- unsigned i, bit_offset = 0;
- unsigned shl, shr;
-
- if (!bits && !offset) {
- if (!*eax_zeroed) {
- /* xor eax, eax */
- I2(0x31, 0xc0);
- }
-
- *eax_zeroed = true;
- goto set_field;
- }
-
- if (!bits) {
- /* just return offset: */
-
- switch (dst_size) {
- case 8:
- if (offset > S32_MAX) {
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
-
- I3(0xc7, 0x47, dst_offset + 4);
- memcpy(out, (void *) &offset + 4, 4);
- out += 4;
- } else {
- /* mov [rdi + dst_offset], offset */
- /* sign extended */
- I4(0x48, 0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- }
- break;
- case 4:
- /* mov [rdi + dst_offset], offset */
- I3(0xc7, 0x47, dst_offset);
- memcpy(out, &offset, 4);
- out += 4;
- break;
- default:
- BUG();
- }
-
- return out;
- }
-
- for (i = 0; i <= field; i++)
- bit_offset += format->bits_per_field[i];
-
- byte -= DIV_ROUND_UP(bit_offset, 8);
- bit_offset = round_up(bit_offset, 8) - bit_offset;
-
- *eax_zeroed = false;
-
- if (bit_offset == 0 && bits == 8) {
- /* movzx eax, BYTE PTR [rsi + imm8] */
- I4(0x0f, 0xb6, 0x46, byte);
- } else if (bit_offset == 0 && bits == 16) {
- /* movzx eax, WORD PTR [rsi + imm8] */
- I4(0x0f, 0xb7, 0x46, byte);
- } else if (bit_offset + bits <= 32) {
- /* mov eax, [rsi + imm8] */
- I3(0x8b, 0x46, byte);
-
- if (bit_offset) {
- /* shr eax, imm8 */
- I3(0xc1, 0xe8, bit_offset);
- }
-
- if (bit_offset + bits < 32) {
- unsigned mask = ~0U >> (32 - bits);
-
- /* and eax, imm32 */
- I1(0x25);
- memcpy(out, &mask, 4);
- out += 4;
- }
- } else if (bit_offset + bits <= 64) {
- /* mov rax, [rsi + imm8] */
- I4(0x48, 0x8b, 0x46, byte);
-
- shl = 64 - bit_offset - bits;
- shr = bit_offset + shl;
-
- if (shl) {
- /* shl rax, imm8 */
- I4(0x48, 0xc1, 0xe0, shl);
- }
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- } else {
- /* mov rax, [rsi + byte] */
- I4(0x48, 0x8b, 0x46, byte);
-
- /* mov edx, [rsi + byte + 8] */
- I3(0x8b, 0x56, byte + 8);
-
- /* bits from next word: */
- shr = bit_offset + bits - 64;
- BUG_ON(shr > bit_offset);
-
- /* shr rax, bit_offset */
- I4(0x48, 0xc1, 0xe8, shr);
-
- /* shl rdx, imm8 */
- I4(0x48, 0xc1, 0xe2, 64 - shr);
-
- /* or rax, rdx */
- I3(0x48, 0x09, 0xd0);
-
- shr = bit_offset - shr;
-
- if (shr) {
- /* shr rax, imm8 */
- I4(0x48, 0xc1, 0xe8, shr);
- }
- }
-
- /* rax += offset: */
- if (offset > S32_MAX) {
- /* mov rdx, imm64 */
- I2(0x48, 0xba);
- memcpy(out, &offset, 8);
- out += 8;
- /* add %rdx, %rax */
- I3(0x48, 0x01, 0xd0);
- } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
- /* add rax, imm32 */
- I2(0x48, 0x05);
- memcpy(out, &offset, 4);
- out += 4;
- } else if (offset) {
- /* add eax, imm32 */
- I1(0x05);
- memcpy(out, &offset, 4);
- out += 4;
- }
-set_field:
- switch (dst_size) {
- case 8:
- /* mov [rdi + dst_offset], rax */
- I4(0x48, 0x89, 0x47, dst_offset);
- break;
- case 4:
- /* mov [rdi + dst_offset], eax */
- I3(0x89, 0x47, dst_offset);
- break;
- default:
- BUG();
- }
-
- return out;
-}
-
-int bch_compile_bkey_format(const struct bkey_format *format, void *_out)
-{
- bool eax_zeroed = false;
- u8 *out = _out;
-
- /*
- * rdi: dst - unpacked key
- * rsi: src - packed key
- */
-
- /* k->u64s, k->format, k->type */
-
- /* mov eax, [rsi] */
- I2(0x8b, 0x06);
-
- /* add eax, BKEY_U64s - format->key_u64s */
- I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
-
- /* and eax, imm32: mask out k->pad: */
- I5(0x25, 0xff, 0xff, 0xff, 0);
-
- /* mov [rdi], eax */
- I2(0x89, 0x07);
-
-#define x(id, field) \
- out = compile_bkey_field(format, out, id, \
- offsetof(struct bkey, field), \
- sizeof(((struct bkey *) NULL)->field), \
- &eax_zeroed);
- bkey_fields()
-#undef x
-
- /* retq */
- I1(0xc3);
-
- return (void *) out - _out;
-}
-
-#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- u64 l_v, r_v;
-
- if (!nr_key_bits)
- return 0;
-
- /* for big endian, skip past header */
- nr_key_bits += high_bit_offset;
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (1) {
- if (nr_key_bits < 64) {
- l_v >>= 64 - nr_key_bits;
- r_v >>= 64 - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= 64;
- }
-
- if (l_v != r_v)
- return l_v < r_v ? -1 : 1;
-
- if (!nr_key_bits)
- return 0;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- }
-}
-#endif
-
-__pure
-int __bkey_cmp_packed_format_checked(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- const struct bkey_format *f = &b->format;
- int ret;
-
- EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- ret = __bkey_cmp_bits(high_word(f, l),
- high_word(f, r),
- b->nr_key_bits);
-
- EBUG_ON(ret != bkey_cmp(bkey_unpack_key_format_checked(b, l).p,
- bkey_unpack_key_format_checked(b, r).p));
- return ret;
-}
-
-__pure __flatten
-int __bkey_cmp_left_packed_format_checked(const struct btree *b,
- const struct bkey_packed *l,
- const struct bpos *r)
-{
- return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
-}
-
-__pure __flatten
-int __bkey_cmp_packed(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
-{
- int packed = bkey_lr_packed(l, r);
-
- if (likely(packed == BKEY_PACKED_BOTH))
- return __bkey_cmp_packed_format_checked(l, r, b);
-
- switch (packed) {
- case BKEY_PACKED_NONE:
- return bkey_cmp(((struct bkey *) l)->p,
- ((struct bkey *) r)->p);
- case BKEY_PACKED_LEFT:
- return __bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) l,
- &((struct bkey *) r)->p);
- case BKEY_PACKED_RIGHT:
- return -__bkey_cmp_left_packed_format_checked(b,
- (struct bkey_packed *) r,
- &((struct bkey *) l)->p);
- default:
- unreachable();
- }
-}
-
-__pure __flatten
-int bkey_cmp_left_packed(const struct btree *b,
- const struct bkey_packed *l, const struct bpos *r)
-{
- const struct bkey *l_unpacked;
-
- return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bkey_cmp(l_unpacked->p, *r)
- : __bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-void bch_bpos_swab(struct bpos *p)
-{
- u8 *l = (u8 *) p;
- u8 *h = ((u8 *) &p[1]) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-void bch_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
-{
- const struct bkey_format *f = bkey_packed(k) ? _f : &bch_bkey_format_current;
- u8 *l = k->key_start;
- u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
-
- while (l < h) {
- swap(*l, *h);
- l++;
- --h;
- }
-}
-
-#ifdef CONFIG_BCACHE_DEBUG
-void bkey_pack_test(void)
-{
- struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
- struct bkey_packed p;
-
- struct bkey_format test_format = {
- .key_u64s = 2,
- .nr_fields = BKEY_NR_FIELDS,
- .bits_per_field = {
- 13,
- 64,
- },
- };
-
- struct unpack_state in_s =
- unpack_state_init(&bch_bkey_format_current, (void *) &t);
- struct pack_state out_s = pack_state_init(&test_format, &p);
- unsigned i;
-
- for (i = 0; i < out_s.format->nr_fields; i++) {
- u64 a, v = get_inc_field(&in_s, i);
-
- switch (i) {
-#define x(id, field) case id: a = t.field; break;
- bkey_fields()
-#undef x
- default:
- BUG();
- }
-
- if (a != v)
- panic("got %llu actual %llu i %u\n", v, a, i);
-
- if (!set_inc_field(&out_s, i, v))
- panic("failed at %u\n", i);
- }
-
- BUG_ON(!bkey_pack_key(&p, &t, &test_format));
-}
-#endif
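The portable __bkey_cmp_bits() above compares only the key bits of two packed keys, one 64-bit word at a time starting from the high word. A self-contained sketch of the same idea (little-endian layout only; cmp_high_bits() and the sample arrays are invented for illustration, not taken from the source):

#include <stdint.h>
#include <stdio.h>

/* Compare the high nr_key_bits of l and r, walking from the high word down. */
static int cmp_high_bits(const uint64_t *l, const uint64_t *r, unsigned nr_key_bits)
{
	while (nr_key_bits) {
		unsigned bits = nr_key_bits < 64 ? nr_key_bits : 64;
		uint64_t l_v = *l >> (64 - bits);
		uint64_t r_v = *r >> (64 - bits);

		if (l_v != r_v)
			return l_v < r_v ? -1 : 1;

		nr_key_bits -= bits;
		l--;	/* little endian: the next lower word is at a lower address */
		r--;
	}
	return 0;
}

int main(void)
{
	/* high word is [1]; the difference sits inside the top 72 bits */
	uint64_t a[2] = { 0x0100000000000000ULL, 0xff00000000000000ULL };
	uint64_t b[2] = { 0x0200000000000000ULL, 0xff00000000000000ULL };

	printf("%d\n", cmp_high_bits(&a[1], &b[1], 72));	/* prints -1: a < b */
	return 0;
}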
diff --git a/libbcache/bkey.h b/libbcache/bkey.h
deleted file mode 100644
index 0893134f..00000000
--- a/libbcache/bkey.h
+++ /dev/null
@@ -1,606 +0,0 @@
-#ifndef _BCACHE_BKEY_H
-#define _BCACHE_BKEY_H
-
-#include <linux/bug.h>
-#include <linux/bcache.h>
-
-#include "util.h"
-#include "vstructs.h"
-
-void bch_to_binary(char *, const u64 *, unsigned);
-int bch_bkey_to_text(char *, size_t, const struct bkey *);
-
-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
-/* bkey with split value, const */
-struct bkey_s_c {
- const struct bkey *k;
- const struct bch_val *v;
-};
-
-/* bkey with split value */
-struct bkey_s {
- union {
- struct {
- struct bkey *k;
- struct bch_val *v;
- };
- struct bkey_s_c s_c;
- };
-};
-
-#define bkey_next(_k) vstruct_next(_k)
-
-static inline unsigned bkey_val_u64s(const struct bkey *k)
-{
- return k->u64s - BKEY_U64s;
-}
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
- return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
- k->u64s = BKEY_U64s + val_u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
- k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
-}
-
-/*
- * Mark a key as deleted without changing the size of the value (i.e. modifying
- * keys in the btree in place)
- */
-static inline void __set_bkey_deleted(struct bkey *k)
-{
- k->type = KEY_TYPE_DELETED;
-}
-
-static inline void set_bkey_deleted(struct bkey *k)
-{
- __set_bkey_deleted(k);
- set_bkey_val_u64s(k, 0);
-}
-
-#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
-
-#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
-
-#define bkey_packed_typecheck(_k) \
-({ \
- BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
- !type_is(_k, struct bkey_packed *)); \
- type_is(_k, struct bkey_packed *); \
-})
-
-enum bkey_lr_packed {
- BKEY_PACKED_BOTH,
- BKEY_PACKED_RIGHT,
- BKEY_PACKED_LEFT,
- BKEY_PACKED_NONE,
-};
-
-#define bkey_lr_packed_typecheck(_l, _r) \
- (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
-
-#define bkey_lr_packed(_l, _r) \
- ((_l)->format + ((_r)->format << 1))
-
-#define bkey_copy(_dst, _src) \
-do { \
- BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
- !type_is(_dst, struct bkey_packed *)); \
- BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
- !type_is(_src, struct bkey_packed *)); \
- EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
- (u64 *) (_dst) < (u64 *) (_src) + \
- ((struct bkey *) (_src))->u64s); \
- \
- __memmove_u64s_down((_dst), (_src), \
- ((struct bkey *) (_src))->u64s); \
-} while (0)
-
-struct btree;
-
-struct bkey_format_state {
- u64 field_min[BKEY_NR_FIELDS];
- u64 field_max[BKEY_NR_FIELDS];
-};
-
-void bch_bkey_format_init(struct bkey_format_state *);
-void bch_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
-void bch_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch_bkey_format_done(struct bkey_format_state *);
-const char *bch_bkey_format_validate(struct bkey_format *);
-
-__pure
-unsigned bkey_greatest_differing_bit(const struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-__pure
-unsigned bkey_ffs(const struct btree *, const struct bkey_packed *);
-
-__pure
-int __bkey_cmp_packed_format_checked(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
-
-__pure
-int __bkey_cmp_left_packed_format_checked(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-__pure
-int __bkey_cmp_packed(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
-
-__pure
-int bkey_cmp_left_packed(const struct btree *,
- const struct bkey_packed *,
- const struct bpos *);
-
-/*
- * we prefer to pass bpos by ref, but it's often enough terribly convenient to
- * pass it by val... as much as I hate c++, const ref would be nice here:
- */
-__pure __flatten
-static inline int bkey_cmp_left_packed_byval(const struct btree *b,
- const struct bkey_packed *l,
- struct bpos r)
-{
- return bkey_cmp_left_packed(b, l, &r);
-}
-
-/*
- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
- * skip dispatching on k->format:
- */
-#define bkey_cmp_packed(_b, _l, _r) \
-({ \
- int _cmp; \
- \
- switch (bkey_lr_packed_typecheck(_l, _r)) { \
- case BKEY_PACKED_NONE: \
- _cmp = bkey_cmp(((struct bkey *) (_l))->p, \
- ((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_LEFT: \
- _cmp = bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_l), \
- &((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_RIGHT: \
- _cmp = -bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_r), \
- &((struct bkey *) (_l))->p); \
- break; \
- case BKEY_PACKED_BOTH: \
- _cmp = __bkey_cmp_packed((void *) (_l), \
- (void *) (_r), (_b)); \
- break; \
- } \
- _cmp; \
-})
-
-#if 1
-static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
-{
- if (l.inode != r.inode)
- return l.inode < r.inode ? -1 : 1;
- if (l.offset != r.offset)
- return l.offset < r.offset ? -1 : 1;
- if (l.snapshot != r.snapshot)
- return l.snapshot < r.snapshot ? -1 : 1;
- return 0;
-}
-#else
-int bkey_cmp(struct bpos l, struct bpos r);
-#endif
-
-static inline struct bpos bpos_min(struct bpos l, struct bpos r)
-{
- return bkey_cmp(l, r) < 0 ? l : r;
-}
-
-void bch_bpos_swab(struct bpos *);
-void bch_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
-
-static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
-{
- if (l.hi != r.hi)
- return l.hi < r.hi ? -1 : 1;
- if (l.lo != r.lo)
- return l.lo < r.lo ? -1 : 1;
- return 0;
-}
-
-#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
-
-static __always_inline int bversion_zero(struct bversion v)
-{
- return !bversion_cmp(v, ZERO_VERSION);
-}
-
-#ifdef CONFIG_BCACHE_DEBUG
-/* statement expressions confusing unlikely()? */
-#define bkey_packed(_k) \
- ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
- (_k)->format != KEY_FORMAT_CURRENT; })
-#else
-#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
-#endif
-
-/*
- * It's safe to treat an unpacked bkey as a packed one, but not the reverse
- */
-static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
-{
- return (struct bkey_packed *) k;
-}
-
-static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
-{
- return (const struct bkey_packed *) k;
-}
-
-static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (struct bkey_i *) k;
-}
-
-static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
-{
- return bkey_packed(k) ? NULL : (const struct bkey *) k;
-}
-
-static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
-{
- return format->bits_per_field[BKEY_FIELD_INODE] +
- format->bits_per_field[BKEY_FIELD_OFFSET] +
- format->bits_per_field[BKEY_FIELD_SNAPSHOT];
-}
-
-static inline struct bpos bkey_successor(struct bpos p)
-{
- struct bpos ret = p;
-
- if (!++ret.offset)
- BUG_ON(!++ret.inode);
-
- return ret;
-}
-
-static inline u64 bkey_start_offset(const struct bkey *k)
-{
- return k->p.offset - k->size;
-}
-
-static inline struct bpos bkey_start_pos(const struct bkey *k)
-{
- return (struct bpos) {
- .inode = k->p.inode,
- .offset = bkey_start_offset(k),
- .snapshot = k->p.snapshot,
- };
-}
-
-/* Packed helpers */
-
-static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
-
- EBUG_ON(k->u64s < ret);
- return ret;
-}
-
-static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_key_u64s(format, k) * sizeof(u64);
-}
-
-static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return k->u64s - bkeyp_key_u64s(format, k);
-}
-
-static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
- const struct bkey_packed *k)
-{
- return bkeyp_val_u64s(format, k) * sizeof(u64);
-}
-
-static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
- struct bkey_packed *k, unsigned val_u64s)
-{
- k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
-}
-
-#define bkeyp_val(_format, _k) \
- ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
-
-extern const struct bkey_format bch_bkey_format_current;
-
-bool bch_bkey_transform(const struct bkey_format *,
- struct bkey_packed *,
- const struct bkey_format *,
- const struct bkey_packed *);
-
-struct bkey __bkey_unpack_key(const struct bkey_format *,
- const struct bkey_packed *);
-
-#ifndef HAVE_BCACHE_COMPILED_UNPACK
-struct bpos __bkey_unpack_pos(const struct bkey_format *,
- const struct bkey_packed *);
-#endif
-
-bool bkey_pack_key(struct bkey_packed *, const struct bkey *,
- const struct bkey_format *);
-
-enum bkey_pack_pos_ret {
- BKEY_PACK_POS_EXACT,
- BKEY_PACK_POS_SMALLER,
- BKEY_PACK_POS_FAIL,
-};
-
-enum bkey_pack_pos_ret bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
- const struct btree *);
-
-static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
- const struct btree *b)
-{
- return bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
-}
-
-void bkey_unpack(const struct btree *, struct bkey_i *,
- const struct bkey_packed *);
-bool bkey_pack(struct bkey_packed *, const struct bkey_i *,
- const struct bkey_format *);
-
-static inline u64 bkey_field_max(const struct bkey_format *f,
- enum bch_bkey_fields nr)
-{
- return f->bits_per_field[nr] < 64
- ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
- : U64_MAX;
-}
-
-#ifdef CONFIG_X86_64
-#define HAVE_BCACHE_COMPILED_UNPACK 1
-
-int bch_compile_bkey_format(const struct bkey_format *, void *);
-
-#else
-
-static inline int bch_compile_bkey_format(const struct bkey_format *format,
- void *out) { return 0; }
-
-#endif
-
-static inline void bkey_reassemble(struct bkey_i *dst,
- struct bkey_s_c src)
-{
- BUG_ON(bkey_packed(src.k));
- dst->k = *src.k;
- memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
-}
-
-#define bkey_s_null ((struct bkey_s) { .k = NULL })
-#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
- return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
- return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
- return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
- return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \
-struct bkey_s_c_##name { \
- union { \
- struct { \
- const struct bkey *k; \
- const struct bch_##name *v; \
- }; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-struct bkey_s_##name { \
- union { \
- struct { \
- struct bkey *k; \
- struct bch_##name *v; \
- }; \
- struct bkey_s_c_##name c; \
- struct bkey_s s; \
- struct bkey_s_c s_c; \
- }; \
-}; \
- \
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline const struct bkey_i_##name * \
-bkey_i_to_##name##_c(const struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return container_of(&k->k, struct bkey_i_##name, k); \
-} \
- \
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
-{ \
- _assert(k.k->type, nr); \
- return (struct bkey_s_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{ \
- _assert(k.k->type, nr); \
- return (struct bkey_s_c_##name) { \
- .k = k.k, \
- .v = container_of(k.v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{ \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-name##_i_to_s_c(const struct bkey_i_##name *k) \
-{ \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = &k->v, \
- }; \
-} \
- \
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return (struct bkey_s_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bkey_s_c_##name \
-bkey_i_to_s_c_##name(const struct bkey_i *k) \
-{ \
- _assert(k->k.type, nr); \
- return (struct bkey_s_c_##name) { \
- .k = &k->k, \
- .v = container_of(&k->v, struct bch_##name, v), \
- }; \
-} \
- \
-static inline struct bch_##name * \
-bkey_p_##name##_val(const struct bkey_format *f, \
- struct bkey_packed *k) \
-{ \
- return container_of(bkeyp_val(f, k), struct bch_##name, v); \
-} \
- \
-static inline const struct bch_##name * \
-bkey_p_c_##name##_val(const struct bkey_format *f, \
- const struct bkey_packed *k) \
-{ \
- return container_of(bkeyp_val(f, k), struct bch_##name, v); \
-} \
- \
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{ \
- struct bkey_i_##name *k = \
- container_of(&_k->k, struct bkey_i_##name, k); \
- \
- bkey_init(&k->k); \
- memset(&k->v, 0, sizeof(k->v)); \
- k->k.type = nr; \
- set_bkey_val_bytes(&k->k, sizeof(k->v)); \
- \
- return k; \
-}
-
-#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr)
-
-#define BKEY_VAL_ACCESSORS(name, _nr) \
- static inline void __bch_##name##_assert(u8 type, u8 nr) \
- { \
- EBUG_ON(type != _nr); \
- } \
- \
- __BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
-
-BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE);
-
-static inline void __bch_extent_assert(u8 type, u8 nr)
-{
- EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
-}
-
-__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch_extent_assert);
-BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION);
-
-BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
-BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
-
-BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
-
-BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
-
-/* byte order helpers */
-
-#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
-#error edit for your odd byteorder.
-#endif
-
-#ifdef __LITTLE_ENDIAN
-
-#define high_bit_offset 0
-#define __high_word(u64s, k) ((k)->_data + (u64s) - 1)
-#define nth_word(p, n) ((p) - (n))
-
-#else
-
-#define high_bit_offset KEY_PACKED_BITS_START
-#define __high_word(u64s, k) ((k)->_data)
-#define nth_word(p, n) ((p) + (n))
-
-#endif
-
-#define high_word(format, k) __high_word((format)->key_u64s, k)
-#define next_word(p) nth_word(p, 1)
-#define prev_word(p) nth_word(p, -1)
-
-#ifdef CONFIG_BCACHE_DEBUG
-void bkey_pack_test(void);
-#else
-static inline void bkey_pack_test(void) {}
-#endif
-
-#endif /* _BCACHE_BKEY_H */
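The split-value types in bkey.h above are built around anonymous unions so that a typed view upcasts to the generic one by plain member access (see the comment above __BKEY_VAL_ACCESSORS). A toy, self-contained sketch of that trick, with invented struct names rather than the real bkey types:

#include <stdio.h>

struct key { unsigned type; };
struct val { int data; };

/* generic split view: key + untyped value pointer */
struct split_c {
	const struct key *k;
	const void *v;
};

/* typed split view; the anonymous union lets it be read as a split_c directly */
struct split_c_foo {
	union {
		struct {
			const struct key *k;
			const struct val *v;
		};
		struct split_c s_c;
	};
};

static void print_generic(struct split_c k)
{
	printf("type %u\n", k.k->type);
}

int main(void)
{
	struct key k = { .type = 1 };
	struct val v = { .data = 42 };
	struct split_c_foo f;

	f.k = &k;
	f.v = &v;

	print_generic(f.s_c);		/* upcast via the union member, no cast needed */
	printf("%d\n", f.v->data);	/* typed access still works */
	return 0;
}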
diff --git a/libbcache/bkey_methods.c b/libbcache/bkey_methods.c
deleted file mode 100644
index 2908489c..00000000
--- a/libbcache/bkey_methods.c
+++ /dev/null
@@ -1,127 +0,0 @@
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "dirent.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "xattr.h"
-
-const struct bkey_ops *bch_bkey_ops[] = {
- [BKEY_TYPE_EXTENTS] = &bch_bkey_extent_ops,
- [BKEY_TYPE_INODES] = &bch_bkey_inode_ops,
- [BKEY_TYPE_DIRENTS] = &bch_bkey_dirent_ops,
- [BKEY_TYPE_XATTRS] = &bch_bkey_xattr_ops,
- [BKEY_TYPE_BTREE] = &bch_bkey_btree_ops,
-};
-
-/* Returns string indicating reason for being invalid, or NULL if valid: */
-const char *bkey_invalid(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
-{
- const struct bkey_ops *ops = bch_bkey_ops[type];
-
- if (k.k->u64s < BKEY_U64s)
- return "u64s too small";
-
- if (k.k->size &&
- (bkey_deleted(k.k) || !ops->is_extents))
- return "nonzero size field";
-
- switch (k.k->type) {
- case KEY_TYPE_DELETED:
- case KEY_TYPE_DISCARD:
- return NULL;
-
- case KEY_TYPE_ERROR:
- return bkey_val_bytes(k.k) != 0
- ? "value size should be zero"
- : NULL;
-
- case KEY_TYPE_COOKIE:
- return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
- ? "incorrect value size"
- : NULL;
-
- default:
- if (k.k->type < KEY_TYPE_GENERIC_NR)
- return "invalid type";
-
- return ops->key_invalid(c, k);
- }
-}
-
-const char *btree_bkey_invalid(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
-{
- if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
- return "key before start of btree node";
-
- if (bkey_cmp(k.k->p, b->data->max_key) > 0)
- return "key past end of btree node";
-
- if (k.k->p.snapshot)
- return "nonzero snapshot";
-
- return bkey_invalid(c, btree_node_type(b), k);
-}
-
-void bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
- enum bkey_type type = btree_node_type(b);
- const struct bkey_ops *ops = bch_bkey_ops[type];
- const char *invalid;
-
- BUG_ON(!k.k->u64s);
-
- invalid = btree_bkey_invalid(c, b, k);
- if (invalid) {
- char buf[160];
-
- bch_bkey_val_to_text(c, type, buf, sizeof(buf), k);
- bch_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
- return;
- }
-
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->key_debugcheck)
- ops->key_debugcheck(c, b, k);
-}
-
-void bch_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
-{
- const struct bkey_ops *ops = bch_bkey_ops[type];
-
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text)
- ops->val_to_text(c, buf, size, k);
-}
-
-void bch_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
-{
- const struct bkey_ops *ops = bch_bkey_ops[type];
- char *out = buf, *end = buf + size;
-
- out += bch_bkey_to_text(out, end - out, k.k);
-
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text) {
- out += scnprintf(out, end - out, " -> ");
- ops->val_to_text(c, out, end - out, k);
- }
-}
-
-void bch_bkey_swab(enum bkey_type type,
- const struct bkey_format *f,
- struct bkey_packed *k)
-{
- const struct bkey_ops *ops = bch_bkey_ops[type];
-
- bch_bkey_swab_key(f, k);
-
- if (ops->swab)
- ops->swab(f, k);
-}
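bkey_invalid() above runs generic checks first and then dispatches to the per-type ops->key_invalid() through the bch_bkey_ops table. A stripped-down, self-contained sketch of that table-dispatch pattern (toy types and a hypothetical cookie_ops entry, not the real tables):

#include <stdio.h>

struct key { unsigned type; unsigned size; };

struct key_ops {
	/* returns a reason string if invalid, NULL if valid */
	const char *(*key_invalid)(const struct key *);
	int is_extents;
};

static const char *cookie_invalid(const struct key *k)
{
	return k->size != 8 ? "incorrect value size" : NULL;
}

static const struct key_ops cookie_ops = { .key_invalid = cookie_invalid };

static const struct key_ops *ops_table[] = {
	[0] = &cookie_ops,
};

static const char *key_invalid(unsigned btree_type, const struct key *k)
{
	const struct key_ops *ops = ops_table[btree_type];

	/* generic checks shared by every key type come first */
	if (k->size && !ops->is_extents)
		return "nonzero size field";

	return ops->key_invalid(k);
}

int main(void)
{
	struct key k = { .type = 0, .size = 4 };
	const char *err = key_invalid(0, &k);

	printf("%s\n", err ? err : "valid");
	return 0;
}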
diff --git a/libbcache/bkey_methods.h b/libbcache/bkey_methods.h
deleted file mode 100644
index 111b1789..00000000
--- a/libbcache/bkey_methods.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef _BCACHE_BKEY_METHODS_H
-#define _BCACHE_BKEY_METHODS_H
-
-#include "bkey.h"
-
-#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
-
-enum bkey_type {
- DEFINE_BCH_BTREE_IDS()
- BKEY_TYPE_BTREE,
-};
-
-/* Type of a key in btree @id at level @level: */
-static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
-{
- return level ? BKEY_TYPE_BTREE : id;
-}
-
-static inline bool btree_type_has_ptrs(enum bkey_type type)
-{
- switch (type) {
- case BKEY_TYPE_BTREE:
- case BKEY_TYPE_EXTENTS:
- return true;
- default:
- return false;
- }
-}
-
-struct bch_fs;
-struct btree;
-struct bkey;
-
-enum merge_result {
- BCH_MERGE_NOMERGE,
-
- /*
- * The keys were mergeable, but would have overflowed size - so instead
- * l was changed to the maximum size, and both keys were modified:
- */
- BCH_MERGE_PARTIAL,
- BCH_MERGE_MERGE,
-};
-
-typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
- struct bkey_s);
-typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
- struct btree *,
- struct bkey_i *, struct bkey_i *);
-
-struct bkey_ops {
- /* Returns reason for being invalid if invalid, else NULL: */
- const char * (*key_invalid)(const struct bch_fs *,
- struct bkey_s_c);
- void (*key_debugcheck)(struct bch_fs *, struct btree *,
- struct bkey_s_c);
- void (*val_to_text)(struct bch_fs *, char *,
- size_t, struct bkey_s_c);
- void (*swab)(const struct bkey_format *, struct bkey_packed *);
- key_filter_fn key_normalize;
- key_merge_fn key_merge;
- bool is_extents;
-};
-
-const char *bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *btree_bkey_invalid(struct bch_fs *, struct btree *,
- struct bkey_s_c);
-
-void bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-void bch_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
-void bch_bkey_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
-
-void bch_bkey_swab(enum bkey_type, const struct bkey_format *,
- struct bkey_packed *);
-
-extern const struct bkey_ops *bch_bkey_ops[];
-
-#undef DEF_BTREE_ID
-
-#endif /* _BCACHE_BKEY_METHODS_H */
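The merge_result values declared above describe three outcomes: no merge, a full merge, and a partial merge where the left key is grown to its maximum size and both keys are modified. A toy, self-contained sketch of that calling convention using integer ranges (MAX_SIZE and range_merge() are invented for illustration; the real extent merge lives in extents.c):

#include <stdio.h>

enum merge_result { MERGE_NOMERGE, MERGE_PARTIAL, MERGE_MERGE };

struct range { unsigned start, size; };

#define MAX_SIZE 128

/* Try to merge r into l; on PARTIAL, l is grown to MAX_SIZE and r is shrunk. */
static enum merge_result range_merge(struct range *l, struct range *r)
{
	if (l->start + l->size != r->start)
		return MERGE_NOMERGE;

	if (l->size + r->size > MAX_SIZE) {
		unsigned moved = MAX_SIZE - l->size;

		l->size = MAX_SIZE;
		r->start += moved;
		r->size -= moved;
		return MERGE_PARTIAL;
	}

	l->size += r->size;
	r->size = 0;
	return MERGE_MERGE;
}

int main(void)
{
	struct range l = { 0, 100 }, r = { 100, 50 };

	/* prints "1 l.size=128 r.size=22": a partial merge */
	printf("%d l.size=%u r.size=%u\n", range_merge(&l, &r), l.size, r.size);
	return 0;
}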
diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c
deleted file mode 100644
index a4522ad2..00000000
--- a/libbcache/blockdev.c
+++ /dev/null
@@ -1,819 +0,0 @@
-
-#include "bcache.h"
-#include "blockdev.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "checksum.h"
-#include "error.h"
-#include "inode.h"
-#include "request.h"
-#include "super-io.h"
-#include "writeback.h"
-
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/random.h>
-
-static int bch_blockdev_major;
-static DEFINE_IDA(bch_blockdev_minor);
-static LIST_HEAD(uncached_devices);
-static DEFINE_MUTEX(bch_blockdev_lock);
-
-static struct kmem_cache *bch_search_cache;
-
-static void write_bdev_super_endio(struct bio *bio)
-{
- struct cached_dev *dc = bio->bi_private;
- /* XXX: error checking */
-
- closure_put(&dc->sb_write);
-}
-
-static void bch_write_bdev_super_unlock(struct closure *cl)
-{
- struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
-
- up(&dc->sb_write_mutex);
-}
-
-void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
-{
- struct backingdev_sb *sb = dc->disk_sb.sb;
- struct closure *cl = &dc->sb_write;
- struct bio *bio = dc->disk_sb.bio;
-
- down(&dc->sb_write_mutex);
- closure_init(cl, parent);
-
- sb->csum = csum_vstruct(NULL, BCH_CSUM_CRC64,
- (struct nonce) { 0 }, sb).lo;
-
- bio_reset(bio);
- bio->bi_bdev = dc->disk_sb.bdev;
- bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
- bio->bi_iter.bi_size =
- roundup(vstruct_bytes(sb),
- bdev_logical_block_size(dc->disk_sb.bdev));
- bio->bi_end_io = write_bdev_super_endio;
- bio->bi_private = dc;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA|REQ_META);
- bch_bio_map(bio, sb);
-
- closure_get(cl);
-
- closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
-}
-
-static int open_dev(struct block_device *b, fmode_t mode)
-{
- struct bcache_device *d = b->bd_disk->private_data;
-
- if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
- return -ENXIO;
-
- closure_get(&d->cl);
- return 0;
-}
-
-static void release_dev(struct gendisk *b, fmode_t mode)
-{
- struct bcache_device *d = b->private_data;
-
- closure_put(&d->cl);
-}
-
-static int ioctl_dev(struct block_device *b, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- struct bcache_device *d = b->bd_disk->private_data;
-
- return d->ioctl(d, mode, cmd, arg);
-}
-
-static const struct block_device_operations bcache_ops = {
- .open = open_dev,
- .release = release_dev,
- .ioctl = ioctl_dev,
- .owner = THIS_MODULE,
-};
-
-void bch_blockdev_stop(struct bcache_device *d)
-{
- if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
- closure_queue(&d->cl);
-}
-
-static void bcache_device_unlink(struct bcache_device *d)
-{
- if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
- sysfs_remove_link(&d->c->kobj, d->name);
- sysfs_remove_link(&d->kobj, "cache");
- }
-}
-
-static void bcache_device_link(struct bcache_device *d, struct bch_fs *c,
- const char *name)
-{
- snprintf(d->name, BCACHEDEVNAME_SIZE,
- "%s%llu", name, bcache_dev_inum(d));
-
- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
- sysfs_create_link(&c->kobj, &d->kobj, d->name),
- "Couldn't create device <-> cache set symlinks");
-
- clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
-}
-
-static void bcache_device_detach(struct bcache_device *d)
-{
- if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
- mutex_lock(&d->inode_lock);
- bch_inode_rm(d->c, bcache_dev_inum(d));
- mutex_unlock(&d->inode_lock);
- }
-
- bcache_device_unlink(d);
-
- radix_tree_delete(&d->c->devices, bcache_dev_inum(d));
-
- closure_put(&d->c->caching);
- d->c = NULL;
-}
-
-static int bcache_device_attach(struct bcache_device *d, struct bch_fs *c)
-{
- int ret;
-
- ret = radix_tree_insert(&c->devices, bcache_dev_inum(d), d);
- if (ret) {
- pr_err("radix_tree_insert() error for inum %llu",
- bcache_dev_inum(d));
- return ret;
- }
-
- d->c = c;
- closure_get(&c->caching);
-
- return ret;
-}
-
-static void bcache_device_free(struct bcache_device *d)
-{
- pr_info("%s stopped", d->disk->disk_name);
-
- if (d->c)
- bcache_device_detach(d);
- if (d->disk && d->disk->flags & GENHD_FL_UP)
- del_gendisk(d->disk);
- if (d->disk && d->disk->queue)
- blk_cleanup_queue(d->disk->queue);
- if (d->disk) {
- ida_simple_remove(&bch_blockdev_minor, d->disk->first_minor);
- put_disk(d->disk);
- }
-
- bioset_exit(&d->bio_split);
-
- closure_debug_destroy(&d->cl);
-}
-
-static int bcache_device_init(struct bcache_device *d, unsigned block_size,
- sector_t sectors)
-{
- struct request_queue *q;
- int minor;
-
- mutex_init(&d->inode_lock);
-
- minor = ida_simple_get(&bch_blockdev_minor, 0, MINORMASK + 1, GFP_KERNEL);
- if (minor < 0) {
- pr_err("cannot allocate minor");
- return minor;
- }
-
- if (!(d->disk = alloc_disk(1)) ||
- bioset_init(&d->bio_split, 4, offsetof(struct bch_read_bio, bio))) {
- pr_err("cannot allocate disk");
- ida_simple_remove(&bch_blockdev_minor, minor);
- return -ENOMEM;
- }
-
- set_capacity(d->disk, sectors);
- snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
-
- d->disk->major = bch_blockdev_major;
- d->disk->first_minor = minor;
- d->disk->fops = &bcache_ops;
- d->disk->private_data = d;
-
- q = blk_alloc_queue(GFP_KERNEL);
- if (!q) {
- pr_err("cannot allocate queue");
- return -ENOMEM;
- }
-
- blk_queue_make_request(q, NULL);
- d->disk->queue = q;
- q->queuedata = d;
- q->backing_dev_info.congested_data = d;
- q->limits.max_hw_sectors = UINT_MAX;
- q->limits.max_sectors = UINT_MAX;
- q->limits.max_segment_size = UINT_MAX;
- q->limits.max_segments = BIO_MAX_PAGES;
- blk_queue_max_discard_sectors(q, UINT_MAX);
- q->limits.discard_granularity = 512;
- q->limits.io_min = block_size;
- q->limits.logical_block_size = block_size;
- q->limits.physical_block_size = block_size;
- set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
- clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
- set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
-
- blk_queue_write_cache(q, true, true);
-
- return 0;
-}
-
-/* Cached device */
-
-static void calc_cached_dev_sectors(struct bch_fs *c)
-{
- u64 sectors = 0;
- struct cached_dev *dc;
-
- list_for_each_entry(dc, &c->cached_devs, list)
- sectors += bdev_sectors(dc->disk_sb.bdev);
-
- c->cached_dev_sectors = sectors;
-}
-
-void bch_cached_dev_run(struct cached_dev *dc)
-{
- struct bcache_device *d = &dc->disk;
- char buf[BCH_SB_LABEL_SIZE + 1];
- char *env[] = {
- "DRIVER=bcache",
- kasprintf(GFP_KERNEL, "CACHED_UUID=%pU",
- dc->disk_sb.sb->disk_uuid.b),
- NULL,
- NULL,
- };
-
- memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
- buf[BCH_SB_LABEL_SIZE] = '\0';
- env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
-
- if (atomic_xchg(&dc->running, 1)) {
- kfree(env[1]);
- kfree(env[2]);
- return;
- }
-
- if (!d->c &&
- BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_NONE) {
- struct closure cl;
-
- closure_init_stack(&cl);
-
- SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_STALE);
- bch_write_bdev_super(dc, &cl);
- closure_sync(&cl);
- }
-
- add_disk(d->disk);
- bd_link_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
-	/* won't show up in the uevent file, use udevadm monitor -e instead:
-	 * only class / kset properties are persistent */
- kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
- kfree(env[1]);
- kfree(env[2]);
-
- if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
- pr_debug("error creating sysfs link");
-}
-
-static void cached_dev_detach_finish(struct work_struct *w)
-{
- struct cached_dev *dc = container_of(w, struct cached_dev, detach);
- char buf[BDEVNAME_SIZE];
- struct closure cl;
-
- closure_init_stack(&cl);
-
- BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
- BUG_ON(atomic_read(&dc->count));
-
- mutex_lock(&bch_blockdev_lock);
-
- memset(&dc->disk_sb.sb->set_uuid, 0, 16);
- SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_NONE);
-
- bch_write_bdev_super(dc, &cl);
- closure_sync(&cl);
-
- bcache_device_detach(&dc->disk);
- list_move(&dc->list, &uncached_devices);
-
- clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
- clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
-
- mutex_unlock(&bch_blockdev_lock);
-
- pr_info("Caching disabled for %s", bdevname(dc->disk_sb.bdev, buf));
-
- /* Drop ref we took in cached_dev_detach() */
- closure_put(&dc->disk.cl);
-}
-
-void bch_cached_dev_detach(struct cached_dev *dc)
-{
- if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
- return;
-
- if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
- return;
-
- /*
- * Block the device from being closed and freed until we're finished
- * detaching
- */
- closure_get(&dc->disk.cl);
-
- dc->writeback_pd.rate.rate = UINT_MAX;
- bch_writeback_queue(dc);
- cached_dev_put(dc);
-}
-
-int bch_cached_dev_attach(struct cached_dev *dc, struct bch_fs *c)
-{
- __le64 rtime = cpu_to_le64(ktime_get_seconds());
- char buf[BDEVNAME_SIZE];
- bool found;
- int ret;
-
- lockdep_assert_held(&c->state_lock);
-
- bdevname(dc->disk_sb.bdev, buf);
-
- if (memcmp(&dc->disk_sb.sb->set_uuid,
- &c->sb.uuid,
- sizeof(c->sb.uuid)))
- return -ENOENT;
-
- if (dc->disk.c) {
- pr_err("Can't attach %s: already attached", buf);
- return -EINVAL;
- }
-
- if (!bch_fs_running(c)) {
- pr_err("Can't attach %s: not running", buf);
- return -EINVAL;
- }
-
- if (le16_to_cpu(dc->disk_sb.sb->block_size) < c->sb.block_size) {
- /* Will die */
- pr_err("Couldn't attach %s: block size less than set's block size",
- buf);
- return -EINVAL;
- }
-
- found = !bch_cached_dev_inode_find_by_uuid(c,
- &dc->disk_sb.sb->disk_uuid,
- &dc->disk.inode);
-
- if (!found && BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
- pr_err("Couldn't find uuid for %s in set", buf);
- return -ENOENT;
- }
-
- if (found &&
- (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE ||
- BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE)) {
- found = false;
- bch_inode_rm(c, bcache_dev_inum(&dc->disk));
- }
-
- /* Deadlocks since we're called via sysfs...
- sysfs_remove_file(&dc->kobj, &sysfs_attach);
- */
-
- if (!found) {
- struct closure cl;
-
- closure_init_stack(&cl);
-
- bkey_inode_blockdev_init(&dc->disk.inode.k_i);
- dc->disk.inode.k.type = BCH_INODE_BLOCKDEV;
- SET_CACHED_DEV(&dc->disk.inode.v, true);
- dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid;
- memcpy(dc->disk.inode.v.i_label,
- dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
- dc->disk.inode.v.i_ctime = rtime;
- dc->disk.inode.v.i_mtime = rtime;
-
- ret = bch_inode_create(c, &dc->disk.inode.k_i,
- 0, BLOCKDEV_INODE_MAX,
- &c->unused_inode_hint);
- if (ret) {
- pr_err("Error %d, not caching %s", ret, buf);
- return ret;
- }
-
- pr_info("attached inode %llu", bcache_dev_inum(&dc->disk));
-
- dc->disk_sb.sb->set_uuid = c->sb.uuid;
- SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
-
- bch_write_bdev_super(dc, &cl);
- closure_sync(&cl);
- } else {
- dc->disk.inode.v.i_mtime = rtime;
- bch_btree_update(c, BTREE_ID_INODES,
- &dc->disk.inode.k_i, NULL);
- }
-
- /* Count dirty sectors before attaching */
- if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY)
- bch_sectors_dirty_init(dc, c);
-
- ret = bcache_device_attach(&dc->disk, c);
- if (ret)
- return ret;
-
- list_move(&dc->list, &c->cached_devs);
- calc_cached_dev_sectors(c);
-
- /*
- * dc->c must be set before dc->count != 0 - paired with the mb in
- * cached_dev_get()
- */
- smp_wmb();
- atomic_set(&dc->count, 1);
-
- if (bch_cached_dev_writeback_start(dc))
- return -ENOMEM;
-
- if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_DIRTY) {
- atomic_set(&dc->has_dirty, 1);
- atomic_inc(&dc->count);
- }
-
- bch_cached_dev_run(dc);
- bcache_device_link(&dc->disk, c, "bdev");
-
- pr_info("Caching %s as %s on set %pU",
- bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name,
- dc->disk.c->sb.uuid.b);
- return 0;
-}
-
-void bch_attach_backing_devs(struct bch_fs *c)
-{
- struct cached_dev *dc, *t;
-
- lockdep_assert_held(&c->state_lock);
-
- mutex_lock(&bch_blockdev_lock);
-
- list_for_each_entry_safe(dc, t, &uncached_devices, list)
- bch_cached_dev_attach(dc, c);
-
- mutex_unlock(&bch_blockdev_lock);
-}
-
-void bch_cached_dev_release(struct kobject *kobj)
-{
- struct cached_dev *dc = container_of(kobj, struct cached_dev,
- disk.kobj);
- kfree(dc);
- module_put(THIS_MODULE);
-}
-
-static void cached_dev_free(struct closure *cl)
-{
- struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
-
- bch_cached_dev_writeback_stop(dc);
- bch_cached_dev_writeback_free(dc);
-
- mutex_lock(&bch_blockdev_lock);
-
- if (atomic_read(&dc->running))
- bd_unlink_disk_holder(dc->disk_sb.bdev, dc->disk.disk);
- bcache_device_free(&dc->disk);
- list_del(&dc->list);
-
- mutex_unlock(&bch_blockdev_lock);
-
- bch_free_super((void *) &dc->disk_sb);
-
- kobject_put(&dc->disk.kobj);
-}
-
-static void cached_dev_flush(struct closure *cl)
-{
- struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
- struct bcache_device *d = &dc->disk;
-
- bch_cache_accounting_destroy(&dc->accounting);
- bcache_device_unlink(d);
- kobject_del(&d->kobj);
-
- continue_at(cl, cached_dev_free, system_wq);
-}
-
-static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
-{
- int ret;
- struct io *io;
- struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
-
- dc->sequential_cutoff = 4 << 20;
-
- for (io = dc->io; io < dc->io + RECENT_IO; io++) {
- list_add(&io->lru, &dc->io_lru);
- hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
- }
-
- dc->disk.stripe_size = q->limits.io_opt >> 9;
-
- if (dc->disk.stripe_size)
- dc->partial_stripes_expensive =
- q->limits.raid_partial_stripes_expensive;
-
- ret = bcache_device_init(&dc->disk, block_size,
- dc->disk_sb.bdev->bd_part->nr_sects -
- le64_to_cpu(dc->disk_sb.sb->data_offset));
- if (ret)
- return ret;
-
- dc->disk.disk->queue->backing_dev_info.ra_pages =
- max(dc->disk.disk->queue->backing_dev_info.ra_pages,
- q->backing_dev_info.ra_pages);
-
- bch_cached_dev_request_init(dc);
- ret = bch_cached_dev_writeback_init(dc);
- if (ret)
- return ret;
-
- return 0;
-}
-
-/* Cached device - bcache superblock */
-
-static const char *bdev_validate_super(struct backingdev_sb *sb)
-{
- switch (le64_to_cpu(sb->version)) {
- case BCACHE_SB_VERSION_BDEV:
- sb->data_offset = cpu_to_le64(BDEV_DATA_START_DEFAULT);
- break;
- case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
- if (le64_to_cpu(sb->data_offset) < BDEV_DATA_START_DEFAULT)
- return "Bad data offset";
-
- break;
- default:
-		return "Unsupported superblock version";
- }
-
- sb->last_mount = cpu_to_le32(get_seconds());
-
- return NULL;
-}
-
-const char *bch_backing_dev_register(struct bcache_superblock *sb)
-{
- char name[BDEVNAME_SIZE];
- const char *err;
- struct bch_fs *c;
- struct cached_dev *dc;
-
- dc = kzalloc(sizeof(*dc), GFP_KERNEL);
- if (!dc)
- return "cannot allocate memory";
-
- __module_get(THIS_MODULE);
- INIT_LIST_HEAD(&dc->list);
- closure_init(&dc->disk.cl, NULL);
- set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
- kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
- INIT_WORK(&dc->detach, cached_dev_detach_finish);
- sema_init(&dc->sb_write_mutex, 1);
- INIT_LIST_HEAD(&dc->io_lru);
- spin_lock_init(&dc->io_lock);
- bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
-
- memcpy(&dc->disk_sb, sb, sizeof(*sb));
- dc->disk_sb.bdev->bd_holder = dc;
- memset(sb, 0, sizeof(*sb));
-
- err = bdev_validate_super(dc->disk_sb.sb);
- if (err)
- goto err;
-
- if (cached_dev_init(dc, le16_to_cpu(dc->disk_sb.sb->block_size) << 9))
- goto err;
-
- err = "error creating kobject";
- if (kobject_add(&dc->disk.kobj,
- &part_to_dev(dc->disk_sb.bdev->bd_part)->kobj,
- "bcache"))
- goto err;
-
- err = "error accounting kobject";
- if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
- goto err;
-
- pr_info("registered backing device %s",
- bdevname(dc->disk_sb.bdev, name));
-
- list_add(&dc->list, &uncached_devices);
- c = bch_uuid_to_fs(dc->disk_sb.sb->set_uuid);
- if (c) {
- bch_cached_dev_attach(dc, c);
- closure_put(&c->cl);
- }
-
- if (BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_NONE ||
- BDEV_STATE(dc->disk_sb.sb) == BDEV_STATE_STALE)
- bch_cached_dev_run(dc);
-
- return NULL;
-err:
- bch_blockdev_stop(&dc->disk);
- return err;
-}
-
-/* Flash only volumes */
-
-void bch_blockdev_volume_release(struct kobject *kobj)
-{
- struct bcache_device *d = container_of(kobj, struct bcache_device,
- kobj);
- kfree(d);
-}
-
-static void blockdev_volume_free(struct closure *cl)
-{
- struct bcache_device *d = container_of(cl, struct bcache_device, cl);
-
- bcache_device_free(d);
- kobject_put(&d->kobj);
-}
-
-static void blockdev_volume_flush(struct closure *cl)
-{
- struct bcache_device *d = container_of(cl, struct bcache_device, cl);
-
- bcache_device_unlink(d);
- kobject_del(&d->kobj);
- continue_at(cl, blockdev_volume_free, system_wq);
-}
-
-static int blockdev_volume_run(struct bch_fs *c,
- struct bkey_s_c_inode_blockdev inode)
-{
- struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
- GFP_KERNEL);
- int ret = -ENOMEM;
-
- if (!d)
- return ret;
-
- bkey_reassemble(&d->inode.k_i, inode.s_c);
-
- closure_init(&d->cl, NULL);
- set_closure_fn(&d->cl, blockdev_volume_flush, system_wq);
-
- kobject_init(&d->kobj, &bch_blockdev_volume_ktype);
-
- ret = bcache_device_init(d, block_bytes(c),
- le64_to_cpu(inode.v->i_size) >> 9);
- if (ret)
- goto err;
-
- ret = bcache_device_attach(d, c);
- if (ret)
- goto err;
-
- bch_blockdev_volume_request_init(d);
- add_disk(d->disk);
-
- if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
- goto err;
-
- bcache_device_link(d, c, "volume");
-
- return 0;
-err:
- kobject_put(&d->kobj);
- return ret;
-}
-
-int bch_blockdev_volumes_start(struct bch_fs *c)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_inode_blockdev inode;
- int ret = 0;
-
- if (!bch_fs_running(c))
- return -EINVAL;
-
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
- if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
- break;
-
- if (k.k->type != BCH_INODE_BLOCKDEV)
- continue;
-
- inode = bkey_s_c_to_inode_blockdev(k);
-
- ret = blockdev_volume_run(c, inode);
- if (ret)
- break;
- }
- bch_btree_iter_unlock(&iter);
-
- return ret;
-}
-
-int bch_blockdev_volume_create(struct bch_fs *c, u64 size)
-{
- __le64 rtime = cpu_to_le64(ktime_get_seconds());
- struct bkey_i_inode_blockdev inode;
- int ret;
-
- bkey_inode_blockdev_init(&inode.k_i);
- get_random_bytes(&inode.v.i_uuid, sizeof(inode.v.i_uuid));
- inode.v.i_ctime = rtime;
- inode.v.i_mtime = rtime;
- inode.v.i_size = cpu_to_le64(size);
-
- ret = bch_inode_create(c, &inode.k_i, 0, BLOCKDEV_INODE_MAX,
- &c->unused_inode_hint);
- if (ret) {
- pr_err("Can't create volume: %d", ret);
- return ret;
- }
-
- return blockdev_volume_run(c, inode_blockdev_i_to_s_c(&inode));
-}
-
-void bch_blockdevs_stop(struct bch_fs *c)
-{
- struct cached_dev *dc;
- struct bcache_device *d;
- struct radix_tree_iter iter;
- void **slot;
-
- mutex_lock(&bch_blockdev_lock);
- rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
- d = radix_tree_deref_slot(slot);
-
- if (CACHED_DEV(&d->inode.v) &&
- test_bit(BCH_FS_DETACHING, &c->flags)) {
- dc = container_of(d, struct cached_dev, disk);
- bch_cached_dev_detach(dc);
- } else {
- bch_blockdev_stop(d);
- }
- }
-
- rcu_read_unlock();
- mutex_unlock(&bch_blockdev_lock);
-}
-
-void bch_fs_blockdev_exit(struct bch_fs *c)
-{
- mempool_exit(&c->search);
-}
-
-int bch_fs_blockdev_init(struct bch_fs *c)
-{
- return mempool_init_slab_pool(&c->search, 1, bch_search_cache);
-}
-
-void bch_blockdev_exit(void)
-{
- kmem_cache_destroy(bch_search_cache);
-
- if (bch_blockdev_major >= 0)
- unregister_blkdev(bch_blockdev_major, "bcache");
-}
-
-int __init bch_blockdev_init(void)
-{
- bch_blockdev_major = register_blkdev(0, "bcache");
- if (bch_blockdev_major < 0)
- return bch_blockdev_major;
-
- bch_search_cache = KMEM_CACHE(search, 0);
- if (!bch_search_cache)
- return -ENOMEM;
-
- return 0;
-}
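The comment in bch_cached_dev_attach() above notes that dc->c must be visible before dc->count becomes nonzero, paired with the barrier in cached_dev_get(). A userspace analogue of that publish/acquire pairing, written with C11 atomics instead of smp_wmb()/smp_mb__after_atomic(), purely to illustrate the ordering (all names here are invented):

#include <stdatomic.h>
#include <stdio.h>

struct fs { int id; };

struct dev {
	struct fs *c;		/* published pointer */
	atomic_int count;	/* nonzero only once c is visible */
};

static void attach(struct dev *dc, struct fs *c)
{
	dc->c = c;
	/* release: the store to c is visible before count becomes nonzero */
	atomic_store_explicit(&dc->count, 1, memory_order_release);
}

static struct fs *get(struct dev *dc)
{
	int v = atomic_load_explicit(&dc->count, memory_order_acquire);

	/* increment-if-nonzero, like atomic_inc_not_zero() */
	for (;;) {
		if (!v)
			return NULL;	/* not attached (or being torn down) */
		if (atomic_compare_exchange_weak_explicit(&dc->count, &v, v + 1,
							  memory_order_acquire,
							  memory_order_acquire))
			return dc->c;	/* safe: acquire pairs with the release above */
	}
}

int main(void)
{
	struct fs c = { .id = 7 };
	struct dev dc = { .c = NULL, .count = 0 };
	struct fs *got;

	attach(&dc, &c);
	got = get(&dc);
	printf("%d\n", got ? got->id : -1);
	return 0;
}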
diff --git a/libbcache/blockdev.h b/libbcache/blockdev.h
deleted file mode 100644
index 5423d776..00000000
--- a/libbcache/blockdev.h
+++ /dev/null
@@ -1,134 +0,0 @@
-#ifndef _BCACHE_BLOCKDEV_H
-#define _BCACHE_BLOCKDEV_H
-
-#include "blockdev_types.h"
-#include "io_types.h"
-
-struct search {
- /* Stack frame for bio_complete */
- struct closure cl;
-
- union {
- struct bch_read_bio rbio;
- struct bch_write_bio wbio;
- };
- /* Not modified */
- struct bio *orig_bio;
- struct bcache_device *d;
-
- unsigned inode;
- unsigned write:1;
-
- /* Flags only used for reads */
- unsigned recoverable:1;
- unsigned read_dirty_data:1;
- unsigned cache_miss:1;
-
- /*
- * For reads: bypass read from cache and insertion into cache
- * For writes: discard key range from cache, sending the write to
- * the backing device (if there is a backing device)
- */
- unsigned bypass:1;
-
- unsigned long start_time;
-
- /*
- * Mostly only used for writes. For reads, we still make use of
- * some trivial fields:
- * - c
- * - error
- */
- struct bch_write_op iop;
-};
-
-#ifndef NO_BCACHE_BLOCKDEV
-
-extern struct kobj_type bch_cached_dev_ktype;
-extern struct kobj_type bch_blockdev_volume_ktype;
-
-void bch_write_bdev_super(struct cached_dev *, struct closure *);
-
-void bch_cached_dev_release(struct kobject *);
-void bch_blockdev_volume_release(struct kobject *);
-
-int bch_cached_dev_attach(struct cached_dev *, struct bch_fs *);
-void bch_attach_backing_devs(struct bch_fs *);
-
-void bch_cached_dev_detach(struct cached_dev *);
-void bch_cached_dev_run(struct cached_dev *);
-void bch_blockdev_stop(struct bcache_device *);
-
-const char *bch_backing_dev_register(struct bcache_superblock *);
-
-int bch_blockdev_volume_create(struct bch_fs *, u64);
-int bch_blockdev_volumes_start(struct bch_fs *);
-
-void bch_blockdevs_stop(struct bch_fs *);
-
-void bch_fs_blockdev_exit(struct bch_fs *);
-int bch_fs_blockdev_init(struct bch_fs *);
-void bch_blockdev_exit(void);
-int bch_blockdev_init(void);
-
-#else
-
-static inline void bch_write_bdev_super(struct cached_dev *dc,
- struct closure *cl) {}
-
-static inline void bch_cached_dev_release(struct kobject *kobj) {}
-static inline void bch_blockdev_volume_release(struct kobject *kobj) {}
-
-static inline int bch_cached_dev_attach(struct cached_dev *dc, struct bch_fs *c)
-{
- return 0;
-}
-static inline void bch_attach_backing_devs(struct bch_fs *c) {}
-
-static inline void bch_cached_dev_detach(struct cached_dev *dc) {}
-static inline void bch_cached_dev_run(struct cached_dev *dc) {}
-static inline void bch_blockdev_stop(struct bcache_device *d) {}
-
-static inline const char *bch_backing_dev_register(struct bcache_superblock *sb)
-{
- return "not implemented";
-}
-
-static inline int bch_blockdev_volume_create(struct bch_fs *c, u64 s) { return 0; }
-static inline int bch_blockdev_volumes_start(struct bch_fs *c) { return 0; }
-
-static inline void bch_blockdevs_stop(struct bch_fs *c) {}
-static inline void bch_fs_blockdev_exit(struct bch_fs *c) {}
-static inline int bch_fs_blockdev_init(struct bch_fs *c) { return 0; }
-static inline void bch_blockdev_exit(void) {}
-static inline int bch_blockdev_init(void) { return 0; }
-
-#endif
-
-static inline void cached_dev_put(struct cached_dev *dc)
-{
- if (atomic_dec_and_test(&dc->count))
- schedule_work(&dc->detach);
-}
-
-static inline bool cached_dev_get(struct cached_dev *dc)
-{
- if (!atomic_inc_not_zero(&dc->count))
- return false;
-
- /* Paired with the mb in cached_dev_attach */
- smp_mb__after_atomic();
- return true;
-}
-
-static inline u64 bcache_dev_inum(struct bcache_device *d)
-{
- return d->inode.k.p.inode;
-}
-
-static inline struct bcache_device *bch_dev_find(struct bch_fs *c, u64 inode)
-{
- return radix_tree_lookup(&c->devices, inode);
-}
-
-#endif /* _BCACHE_BLOCKDEV_H */
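cached_dev_put() above defers teardown until the last reference drops (atomic_dec_and_test() followed by schedule_work(&dc->detach)). A minimal userspace sketch of that last-ref-schedules-cleanup shape, with a direct call standing in for the workqueue (names invented):

#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int count;
};

static void teardown(struct obj *o)
{
	/* in the kernel code this runs later, from the detach work item */
	(void) o;
	printf("tearing down\n");
}

static void obj_put(struct obj *o)
{
	/* fetch_sub returns the old value; 1 means we just dropped the last ref */
	if (atomic_fetch_sub_explicit(&o->count, 1, memory_order_acq_rel) == 1)
		teardown(o);
}

int main(void)
{
	struct obj o = { .count = 2 };

	obj_put(&o);	/* still referenced */
	obj_put(&o);	/* last ref: teardown runs */
	return 0;
}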
diff --git a/libbcache/blockdev_types.h b/libbcache/blockdev_types.h
deleted file mode 100644
index e5172004..00000000
--- a/libbcache/blockdev_types.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef _BCACHE_BLOCKDEV_TYPES_H
-#define _BCACHE_BLOCKDEV_TYPES_H
-
-#include "keybuf_types.h"
-#include "stats_types.h"
-#include "super_types.h"
-#include "util.h"
-
-struct bcache_device {
- struct closure cl;
-
- struct kobject kobj;
-
- struct bch_fs *c;
-
- struct rb_node node;
- struct bkey_i_inode_blockdev inode;
- struct mutex inode_lock;
-
-#define BCACHEDEVNAME_SIZE 12
- char name[BCACHEDEVNAME_SIZE];
-
- struct gendisk *disk;
-
- unsigned long flags;
-#define BCACHE_DEV_CLOSING 0
-#define BCACHE_DEV_DETACHING 1
-#define BCACHE_DEV_UNLINK_DONE 2
-
- unsigned nr_stripes;
- unsigned stripe_size;
- atomic_t *stripe_sectors_dirty;
- unsigned long *full_dirty_stripes;
-
- struct bio_set bio_split;
-
- unsigned data_csum:1;
-
- int (*ioctl)(struct bcache_device *, fmode_t, unsigned, unsigned long);
-};
-
-struct io {
- /* Used to track sequential IO so it can be skipped */
- struct hlist_node hash;
- struct list_head lru;
-
- unsigned long last_io;
- unsigned sequential;
- sector_t last;
-};
-
-struct cached_dev {
- struct list_head list;
- struct bcache_device disk;
-
- //struct backingdev_sb sb;
-
- struct {
- struct backingdev_sb *sb;
- struct block_device *bdev;
- struct bio *bio;
- unsigned page_order;
- } disk_sb;
- struct closure sb_write;
- struct semaphore sb_write_mutex;
-
- /* Refcount on the cache set. Always nonzero when we're caching. */
- atomic_t count;
- struct work_struct detach;
-
- /*
- * Device might not be running if it's dirty and the cache set hasn't
- * showed up yet.
- */
- atomic_t running;
-
- /*
- * Writes take a shared lock from start to finish; scanning for dirty
- * data to refill the rb tree requires an exclusive lock.
- */
- struct rw_semaphore writeback_lock;
-
- /*
- * Nonzero, and writeback has a refcount (d->count), iff there is dirty
- * data in the cache. Protected by writeback_lock; must have an
-	 * data in the cache. Protected by writeback_lock; must have a
- */
- atomic_t has_dirty;
-
- /* for dynamic rate control of writeback */
- struct bch_pd_controller writeback_pd;
- struct delayed_work writeback_pd_update;
- unsigned writeback_pd_update_seconds;
-
- struct task_struct *writeback_thread;
- struct keybuf writeback_keys;
- mempool_t writeback_io_pool;
- mempool_t writeback_page_pool;
-
- /* For tracking sequential IO */
-#define RECENT_IO_BITS 7
-#define RECENT_IO (1 << RECENT_IO_BITS)
- struct io io[RECENT_IO];
- struct hlist_head io_hash[RECENT_IO + 1];
- struct list_head io_lru;
- spinlock_t io_lock;
-
- struct cache_accounting accounting;
-
- /* The rest of this all shows up in sysfs */
- unsigned sequential_cutoff;
- unsigned readahead;
-
- unsigned verify:1;
- unsigned bypass_torture_test:1;
-
- unsigned partial_stripes_expensive:1;
- unsigned writeback_metadata:1;
- unsigned writeback_running:1;
- unsigned char writeback_percent;
-};
-
-#endif /* _BCACHE_BLOCKDEV_TYPES_H */
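The struct io fields above (last, sequential) exist to track sequential IO per stream; the real code hashes recent IOs and keeps them on an LRU, and dc->sequential_cutoff (4 MB by default in cached_dev_init()) decides when a stream should bypass the cache. A heavily simplified single-stream sketch of what those fields are for (should_bypass() here is invented, not the real request-path logic):

#include <stdbool.h>
#include <stdio.h>

/* single-stream simplification of struct io above */
struct io_track {
	unsigned long long last;	/* sector right after the previous request */
	unsigned sequential;		/* bytes of sequential IO seen so far */
};

#define SEQUENTIAL_CUTOFF (4 << 20)	/* mirrors the dc->sequential_cutoff default */

static bool should_bypass(struct io_track *io, unsigned long long sector, unsigned bytes)
{
	if (sector == io->last)
		io->sequential += bytes;	/* continues the previous request */
	else
		io->sequential = bytes;		/* new stream: start counting again */

	io->last = sector + (bytes >> 9);
	return io->sequential >= SEQUENTIAL_CUTOFF;
}

int main(void)
{
	struct io_track io = { 0, 0 };
	unsigned long long sector = 0;
	unsigned i;
	bool bypass = false;

	/* 64 sequential 128K requests add up to 8M, crossing the 4M cutoff */
	for (i = 0; i < 64; i++) {
		bypass = should_bypass(&io, sector, 128 << 10);
		sector += (128 << 10) >> 9;
	}
	printf("%d\n", bypass);
	return 0;
}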
diff --git a/libbcache/bset.c b/libbcache/bset.c
deleted file mode 100644
index a88d8017..00000000
--- a/libbcache/bset.c
+++ /dev/null
@@ -1,1846 +0,0 @@
-/*
- * Code for working with individual keys, and sorted sets of keys within a
- * btree node
- *
- * Copyright 2012 Google, Inc.
- */
-
-#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
-
-#include "eytzinger.h"
-#include "util.h"
-#include "bset.h"
-
-#include <asm/unaligned.h>
-#include <linux/dynamic_fault.h>
-#include <linux/console.h>
-#include <linux/random.h>
-#include <linux/prefetch.h>
-
-/* hack.. */
-#include "alloc_types.h"
-#include <trace/events/bcache.h>
-
-struct bset_tree *bch_bkey_to_bset(struct btree *b, struct bkey_packed *k)
-{
- struct bset_tree *t;
-
- for_each_bset(b, t)
- if (k >= btree_bkey_first(b, t) &&
- k < btree_bkey_last(b, t))
- return t;
-
- BUG();
-}
-
-/*
- * There are never duplicate live keys in the btree - but including keys that
- * have been flagged as deleted (and will be cleaned up later) we _will_ see
- * duplicates.
- *
- * Thus the sort order is: usual key comparison first, but for keys that compare
- * equal the deleted key(s) come first, and the (at most one) live version comes
- * last.
- *
- * The main reason for this is insertion: to handle overwrites, we first iterate
- * over keys that compare equal to our insert key, and then insert immediately
- * prior to the first key greater than the key we're inserting - our insert
- * position will be after all keys that compare equal to our insert key, which
- * by the time we actually do the insert will all be deleted.
- */
-
-void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
-{
- struct bkey_packed *_k, *_n;
- struct bkey k, n;
- char buf[120];
-
- if (!i->u64s)
- return;
-
- for (_k = i->start, k = bkey_unpack_key(b, _k);
- _k < vstruct_last(i);
- _k = _n, k = n) {
- _n = bkey_next(_k);
-
- bch_bkey_to_text(buf, sizeof(buf), &k);
- printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
- _k->_data - i->_data, i->u64s, buf);
-
- if (_n == vstruct_last(i))
- continue;
-
- n = bkey_unpack_key(b, _n);
-
- if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) {
- printk(KERN_ERR "Key skipped backwards\n");
- continue;
- }
-
- /*
-		 * Weird check for duplicate non-extent keys: extents are
-		 * deleted iff they have 0 size, so if a key has zero size and is
-		 * not deleted, it isn't an extent:
- */
- if (((!k.size && !bkey_deleted(&k)) ||
- (!n.size && !bkey_deleted(&n))) &&
- !bkey_deleted(&k) &&
- !bkey_cmp(n.p, k.p))
- printk(KERN_ERR "Duplicate keys\n");
- }
-}
-
-void bch_dump_btree_node(struct btree *b)
-{
- struct bset_tree *t;
-
- console_lock();
- for_each_bset(b, t)
- bch_dump_bset(b, bset(b, t), t - b->set);
- console_unlock();
-}
-
-void bch_dump_btree_node_iter(struct btree *b,
- struct btree_node_iter *iter)
-{
- struct btree_node_iter_set *set;
-
- printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
-
- btree_node_iter_for_each(iter, set) {
- struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch_bkey_to_bset(b, k);
- struct bkey uk = bkey_unpack_key(b, k);
- char buf[100];
-
- bch_bkey_to_text(buf, sizeof(buf), &uk);
- printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
- k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
- }
-}
-
-#ifdef CONFIG_BCACHE_DEBUG
-
-static bool keys_out_of_order(struct btree *b,
- const struct bkey_packed *prev,
- const struct bkey_packed *next,
- bool is_extents)
-{
- struct bkey nextu = bkey_unpack_key(b, next);
-
- return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
- ((is_extents
- ? !bkey_deleted(next)
- : !bkey_deleted(prev)) &&
- !bkey_cmp_packed(b, prev, next));
-}
-
-void __bch_verify_btree_nr_keys(struct btree *b)
-{
- struct bset_tree *t;
- struct bkey_packed *k;
- struct btree_nr_keys nr = { 0 };
-
- for_each_bset(b, t)
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k))
- if (!bkey_whiteout(k))
- btree_keys_account_key_add(&nr, t - b->set, k);
-
- BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
-}
-
-static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey_packed *k)
-{
- const struct bkey_packed *n = bch_btree_node_iter_peek_all(iter, b);
-
- bkey_unpack_key(b, k);
-
- if (n &&
- keys_out_of_order(b, k, n, iter->is_extents)) {
- struct bkey ku = bkey_unpack_key(b, k);
- struct bkey nu = bkey_unpack_key(b, n);
- char buf1[80], buf2[80];
-
- bch_dump_btree_node(b);
- bch_bkey_to_text(buf1, sizeof(buf1), &ku);
- bch_bkey_to_text(buf2, sizeof(buf2), &nu);
- panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
- }
-}
-
-void bch_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter_set *set;
- struct bset_tree *t;
- struct bkey_packed *k, *first;
-
- BUG_ON(iter->used > MAX_BSETS);
-
- if (!iter->used)
- return;
-
- btree_node_iter_for_each(iter, set) {
- k = __btree_node_offset_to_key(b, set->k);
- t = bch_bkey_to_bset(b, k);
-
- BUG_ON(__btree_node_offset_to_key(b, set->end) !=
- btree_bkey_last(b, t));
-
- BUG_ON(set + 1 < iter->data + iter->used &&
- btree_node_iter_cmp(iter, b, set[0], set[1]) > 0);
- }
-
- first = __btree_node_offset_to_key(b, iter->data[0].k);
-
- for_each_bset(b, t)
- if (bch_btree_node_iter_bset_pos(iter, b, t) ==
- btree_bkey_last(b, t) &&
- (k = bkey_prev_all(b, t, btree_bkey_last(b, t))))
- BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
- k, first) > 0);
-}
-
-void bch_verify_key_order(struct btree *b,
- struct btree_node_iter *iter,
- struct bkey_packed *where)
-{
- struct bset_tree *t = bch_bkey_to_bset(b, where);
- struct bkey_packed *k, *prev;
- struct bkey uk, uw = bkey_unpack_key(b, where);
-
- k = bkey_prev_all(b, t, where);
- if (k &&
- keys_out_of_order(b, k, where, iter->is_extents)) {
- char buf1[100], buf2[100];
-
- bch_dump_btree_node(b);
- uk = bkey_unpack_key(b, k);
- bch_bkey_to_text(buf1, sizeof(buf1), &uk);
- bch_bkey_to_text(buf2, sizeof(buf2), &uw);
- panic("out of order with prev:\n%s\n%s\n",
- buf1, buf2);
- }
-
- k = bkey_next(where);
- BUG_ON(k != btree_bkey_last(b, t) &&
- keys_out_of_order(b, where, k, iter->is_extents));
-
- for_each_bset(b, t) {
- if (where >= btree_bkey_first(b, t) ||
- where < btree_bkey_last(b, t))
- continue;
-
- k = bch_btree_node_iter_bset_pos(iter, b, t);
-
- if (k == btree_bkey_last(b, t))
- k = bkey_prev_all(b, t, k);
-
- while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
- (prev = bkey_prev_all(b, t, k)))
- k = prev;
-
- for (;
- k != btree_bkey_last(b, t);
- k = bkey_next(k)) {
- uk = bkey_unpack_key(b, k);
-
- if (iter->is_extents) {
- BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
- bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
- } else {
- BUG_ON(!bkey_cmp(uw.p, uk.p) &&
- !bkey_deleted(&uk));
- }
-
- if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
- break;
- }
- }
-}
-
-#else
-
-static void bch_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey_packed *k) {}
-
-#endif
-
-/* Auxiliary search trees */
-
-#define BFLOAT_FAILED_UNPACKED (U8_MAX - 0)
-#define BFLOAT_FAILED_PREV (U8_MAX - 1)
-#define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2)
-#define BFLOAT_FAILED (U8_MAX - 2)
-
-#define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS)
-
-struct bkey_float {
- u8 exponent;
- u8 key_offset;
- union {
- u32 mantissa32;
- struct {
- u16 mantissa16;
- u16 _pad;
- };
- };
-} __packed;
-
-#define BFLOAT_32BIT_NR 32U
-
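-/*
- * The first BFLOAT_32BIT_NR floats use the 6 byte layout (32 bit mantissa),
- * the rest use the 4 byte layout (16 bit mantissa) - so float @idx starts at
- * byte idx * 6 for idx < 32, and at 32 * 6 + (idx - 32) * 4 after that:
- */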
-static unsigned bkey_float_byte_offset(unsigned idx)
-{
- int d = (idx - BFLOAT_32BIT_NR) << 1;
-
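-	/*
-	 * branchless max(d, 0): the arithmetic shift smears the sign bit, so
-	 * a negative d (idx < BFLOAT_32BIT_NR) gets cleared to 0:
-	 */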
- d &= ~(d >> 31);
-
- return idx * 6 - d;
-}
-
-struct ro_aux_tree {
- struct bkey_float _d[0];
-};
-
-struct rw_aux_tree {
- u16 offset;
- struct bpos k;
-};
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It defines the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliary search tree - when we're done searching the bkey_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE 128
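-
-/*
- * For example: a 4096 byte bset is covered by at most 4096 / BSET_CACHELINE =
- * 32 auxiliary tree nodes, and after the tree descent the linear search scans
- * at most 128 bytes of keys - roughly two 64 byte hardware cachelines.
- */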
-
-/* Space required for the btree node keys */
-static inline size_t btree_keys_bytes(struct btree *b)
-{
- return PAGE_SIZE << b->page_order;
-}
-
-static inline size_t btree_keys_cachelines(struct btree *b)
-{
- return btree_keys_bytes(b) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(struct btree *b)
-{
- return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(struct btree *b)
-{
- return btree_aux_data_bytes(b) / sizeof(u64);
-}
-
-static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
-{
- BUG_ON(t->aux_data_offset == U16_MAX);
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- return t->aux_data_offset;
- case BSET_RO_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(bkey_float_byte_offset(t->size) +
- sizeof(u8) * t->size, 8);
- case BSET_RW_AUX_TREE:
- return t->aux_data_offset +
- DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
- default:
- BUG();
- }
-}
-
-static unsigned bset_aux_tree_buf_start(const struct btree *b,
- const struct bset_tree *t)
-{
- return t == b->set
- ? DIV_ROUND_UP(b->unpack_fn_len, 8)
- : bset_aux_tree_buf_end(t - 1);
-}
-
-static void *__aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- return b->aux_data + t->aux_data_offset * 8;
-}
-
-static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-static u8 *ro_aux_tree_prev(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
-}
-
-static struct bkey_float *bkey_float_get(struct ro_aux_tree *b,
- unsigned idx)
-{
- return (void *) b + bkey_float_byte_offset(idx);
-}
-
-static struct bkey_float *bkey_float(const struct btree *b,
- const struct bset_tree *t,
- unsigned idx)
-{
- return bkey_float_get(ro_aux_tree_base(b, t), idx);
-}
-
-static void bset_aux_tree_verify(struct btree *b)
-{
-#ifdef CONFIG_BCACHE_DEBUG
- struct bset_tree *t;
-
- for_each_bset(b, t) {
- if (t->aux_data_offset == U16_MAX)
- continue;
-
- BUG_ON(t != b->set &&
- t[-1].aux_data_offset == U16_MAX);
-
- BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
- BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
- BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
- }
-#endif
-}
-
-/* Memory allocation */
-
-void bch_btree_keys_free(struct btree *b)
-{
- vfree(b->aux_data);
- b->aux_data = NULL;
-}
-
-int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
-{
- b->page_order = page_order;
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
- if (!b->aux_data)
- return -ENOMEM;
-
- return 0;
-}
-
-void bch_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
-{
- unsigned i;
-
- b->nsets = 0;
- memset(&b->nr, 0, sizeof(b->nr));
-#ifdef CONFIG_BCACHE_DEBUG
- b->expensive_debug_checks = expensive_debug_checks;
-#endif
- for (i = 0; i < MAX_BSETS; i++)
- b->set[i].data_offset = U16_MAX;
-
- bch_bset_set_no_aux_tree(b, b->set);
-}
-
-/* Binary tree stuff for auxiliary search trees */
-
-/*
- * Cacheline/offset <-> bkey pointer arithmetic:
- *
- * The ro aux tree is a binary search tree in an array; each node corresponds
- * to a key in one cacheline of the bset (BSET_CACHELINE bytes).
- *
- * This means we don't have to store the full index of the key that a node in
- * the binary tree points to; eytzinger_to_inorder() gives us the cacheline,
- * and then bkey_float->key_offset gives us the offset within that cacheline,
- * in units of 8 bytes.
- *
- * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
- * make this work.
- *
- * To construct the bfloat for an arbitrary key we need to know what the key
- * immediately preceding it is: we have to check if the two keys differ in the
- * bits we're going to store in bkey_float->mantissa. The previous-key-size
- * table (ro_aux_tree_prev()) stores the size of the previous key so we can
- * walk backwards to it from the key tree_to_bkey() points at.
- */
-
-static inline void *bset_cacheline(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline)
-{
- return (void *) round_down((unsigned long) btree_bkey_first(b, t),
- L1_CACHE_BYTES) +
- cacheline * BSET_CACHELINE;
-}
-
-static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- unsigned offset)
-{
- return bset_cacheline(b, t, cacheline) + offset * 8;
-}
-
-static unsigned bkey_to_cacheline(const struct btree *b,
- const struct bset_tree *t,
- const struct bkey_packed *k)
-{
- return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
-}
-
-static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
-}
-
-static unsigned bkey_to_cacheline_offset(const struct btree *b,
- const struct bset_tree *t,
- unsigned cacheline,
- const struct bkey_packed *k)
-{
- size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
-
- EBUG_ON(m > U8_MAX);
- return m;
-}
-
-static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- return cacheline_to_bkey(b, t,
- __eytzinger_to_inorder(j, t->size, t->extra),
- bkey_float(b, t, j)->key_offset);
-}
-
-static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
-
- return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
-}
-
-static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-
- return __aux_tree_base(b, t);
-}
-
-/*
- * For the write set - the one we're currently inserting keys into - we don't
- * maintain a full search tree, we just keep a simple lookup table (the rw
- * aux tree) with roughly one entry per cacheline of keys.
- */
-static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
- struct bset_tree *t,
- unsigned j)
-{
- return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
-}
-
-static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
- unsigned j, struct bkey_packed *k)
-{
- BUG_ON(k >= btree_bkey_last(b, t));
-
- rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
- .offset = __btree_node_key_to_offset(b, k),
- .k = bkey_unpack_pos(b, k),
- };
-}
-
-static void bch_bset_verify_rw_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- struct bkey_packed *k = btree_bkey_first(b, t);
- unsigned j = 0;
-
- if (!btree_keys_expensive_checks(b))
- return;
-
- BUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- BUG_ON(t->size < 1);
- BUG_ON(rw_aux_to_bkey(b, t, j) != k);
-
- goto start;
- while (1) {
- if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
- bkey_unpack_pos(b, k)));
-start:
- if (++j == t->size)
- break;
-
- BUG_ON(rw_aux_tree(b, t)[j].offset <=
- rw_aux_tree(b, t)[j - 1].offset);
- }
-
- k = bkey_next(k);
- BUG_ON(k >= btree_bkey_last(b, t));
- }
-}
-
-/* returns idx of first entry >= offset: */
-static unsigned rw_aux_tree_bsearch(struct btree *b,
- struct bset_tree *t,
- unsigned offset)
-{
- unsigned l = 0, r = t->size;
-
- BUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
-
- while (l < r) {
- unsigned m = (l + r) >> 1;
-
- if (rw_aux_tree(b, t)[m].offset < offset)
- l = m + 1;
- else
- r = m;
- }
-
- BUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset < offset);
- BUG_ON(l &&
- rw_aux_tree(b, t)[l - 1].offset >= offset);
-
- BUG_ON(l > r);
- BUG_ON(l > t->size);
-
- return l;
-}
-
-static inline unsigned bfloat_mantissa(const struct bkey_float *f,
- unsigned idx)
-{
- return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16;
-}
-
-static inline void bfloat_mantissa_set(struct bkey_float *f,
- unsigned idx, unsigned mantissa)
-{
- if (idx < BFLOAT_32BIT_NR)
- f->mantissa32 = mantissa;
- else
- f->mantissa16 = mantissa;
-}
-
-static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f,
- unsigned idx)
-{
- u64 v;
-
- EBUG_ON(!bkey_packed(k));
-
- v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
-
- /*
- * In little endian, we're shifting off low bits (and then the bits we
- * want are at the low end), in big endian we're shifting off high bits
- * (and then the bits we want are at the high end, so we shift them
- * back down):
- */
-#ifdef __LITTLE_ENDIAN
- v >>= f->exponent & 7;
-#else
- v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
-#endif
- return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v;
-}
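-
-/*
- * Purely illustrative, self contained sketch of the little endian case above
- * (example_mantissa16() is not part of this file): read the 16 bit window
- * that starts @exponent bits into a key stored as raw bytes. Assumes at
- * least 8 readable bytes from the starting byte offset, just as the
- * get_unaligned() read above does:
- */
-static inline unsigned example_mantissa16(const unsigned char *key,
-					   unsigned exponent)
-{
-	unsigned long long v = 0;
-	unsigned i;
-
-	/* assemble 64 bits, little endian, from the byte holding bit @exponent */
-	for (i = 0; i < 8; i++)
-		v |= (unsigned long long) key[(exponent >> 3) + i] << (8 * i);
-
-	/* drop the sub-byte offset so bit @exponent lands at bit 0 */
-	return (unsigned) (v >> (exponent & 7)) & 0xffff;
-}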
-
-static void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
-{
- struct bkey_float *f = bkey_float(b, t, j);
- struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *p = tree_to_prev_bkey(b, t, j);
- struct bkey_packed *l, *r;
- unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16;
- unsigned mantissa;
- int shift, exponent;
-
- EBUG_ON(bkey_next(p) != m);
-
- if (is_power_of_2(j)) {
- l = min_key;
-
- if (!l->u64s) {
- if (!bkey_pack_pos(l, b->data->min_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = b->data->min_key;
- bkey_copy(l, &tmp);
- }
- }
- } else {
- l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
- EBUG_ON(m < l);
- }
-
- if (is_power_of_2(j + 1)) {
- r = max_key;
-
- if (!r->u64s) {
- if (!bkey_pack_pos(r, t->max_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = t->max_key;
- bkey_copy(r, &tmp);
- }
- }
- } else {
- r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- EBUG_ON(m > r);
- }
-
- /*
- * for failed bfloats, the lookup code falls back to comparing against
- * the original key.
- */
-
- if (!bkey_packed(l) || !bkey_packed(r) ||
- !bkey_packed(p) || !bkey_packed(m)) {
- f->exponent = BFLOAT_FAILED_UNPACKED;
- return;
- }
-
- /*
- * The greatest differing bit of l and r is the first bit we must
- * include in the bfloat mantissa we're creating in order to do
- * comparisons - that bit always becomes the high bit of
- * bfloat->mantissa, and thus the exponent we're calculating here is
- * the position of what will become the low bit in bfloat->mantissa:
- *
- * Note that this may be negative - we may be running off the low end
- * of the key: we handle this later:
- */
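-	/*
-	 * e.g. with 16 bit mantissas: if l and r first differ at bit 60, then
-	 * exponent = 60 - 15 = 45 and the mantissa holds bits 45..60 of m.
-	 * Any search key bracketed by l and r agrees with m above bit 60, so
-	 * those 16 bits are enough to order m against it (modulo the failure
-	 * cases flagged below).
-	 */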
- exponent = (int) bkey_greatest_differing_bit(b, l, r) - (bits - 1);
-
- /*
- * Then we calculate the actual shift value, from the start of the key
- * (k->_data), to get the key bits starting at exponent:
- */
-#ifdef __LITTLE_ENDIAN
- shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
-
- EBUG_ON(shift + bits > b->format.key_u64s * 64);
-#else
- shift = high_bit_offset +
- b->nr_key_bits -
- exponent -
- bits;
-
- EBUG_ON(shift < KEY_PACKED_BITS_START);
-#endif
- EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
-
- f->exponent = shift;
- mantissa = bkey_mantissa(m, f, j);
-
- /*
- * If we've got garbage bits, set them to all 1s - it's legal for the
- * bfloat to compare larger than the original key, but not smaller:
- */
- if (exponent < 0)
- mantissa |= ~(~0U << -exponent);
-
- bfloat_mantissa_set(f, j, mantissa);
-
- /*
- * The bfloat must be able to tell its key apart from the previous key -
- * if its key and the previous key don't differ in the required bits,
- * flag as failed - unless the keys are actually equal, in which case
- * we aren't required to return a specific one:
- */
- if (exponent > 0 &&
- bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) &&
- bkey_cmp_packed(b, p, m)) {
- f->exponent = BFLOAT_FAILED_PREV;
- return;
- }
-
- /*
- * f->mantissa must compare >= the original key - for transitivity with
- * the comparison in bset_search_tree. If we're dropping set bits,
- * increment it:
- */
- if (exponent > (int) bkey_ffs(b, m)) {
- if (j < BFLOAT_32BIT_NR
- ? f->mantissa32 == U32_MAX
- : f->mantissa16 == U16_MAX)
- f->exponent = BFLOAT_FAILED_OVERFLOW;
-
- if (j < BFLOAT_32BIT_NR)
- f->mantissa32++;
- else
- f->mantissa16++;
- }
-}
-
-/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
-{
- bset_aux_tree_verify(b);
-
- return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
-}
-
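-/*
- * The 7 and 5 below: a 32 bit mantissa bkey_float is 6 bytes and a 16 bit one
- * is 4 bytes, and each node also needs one byte in the previous-key-size
- * table (ro_aux_tree_prev()).
- */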
-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
-{
- unsigned bytes = __bset_tree_capacity(b, t);
-
- if (bytes < 7 * BFLOAT_32BIT_NR)
- return bytes / 7;
-
- bytes -= 7 * BFLOAT_32BIT_NR;
-
- return BFLOAT_32BIT_NR + bytes / 5;
-}
-
-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
-{
- return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
-}
-
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *k;
-
- t->size = 1;
- t->extra = BSET_RW_AUX_TREE_VAL;
- rw_aux_tree(b, t)[0].offset =
- __btree_node_key_to_offset(b, btree_bkey_first(b, t));
-
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k)) {
- if (t->size == bset_rw_tree_capacity(b, t))
- break;
-
- if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
- L1_CACHE_BYTES)
- rw_aux_tree_set(b, t, t->size++, k);
- }
-}
-
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
-{
- struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
- struct bkey_packed min_key, max_key;
- unsigned j, cacheline = 1;
-
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
- t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
- bset_ro_tree_capacity(b, t));
-retry:
- if (t->size < 2) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- return;
- }
-
- t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
-
- /* First we figure out where the first key in each cacheline is */
- eytzinger_for_each(j, t->size) {
- while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_next(k);
-
- if (k >= btree_bkey_last(b, t)) {
- t->size--;
- goto retry;
- }
-
- ro_aux_tree_prev(b, t)[j] = prev->u64s;
- bkey_float(b, t, j)->key_offset =
- bkey_to_cacheline_offset(b, t, cacheline++, k);
-
- BUG_ON(tree_to_prev_bkey(b, t, j) != prev);
- BUG_ON(tree_to_bkey(b, t, j) != k);
- }
-
- while (bkey_next(k) != btree_bkey_last(b, t))
- k = bkey_next(k);
-
- t->max_key = bkey_unpack_pos(b, k);
-
- /* Then we build the tree */
- eytzinger_for_each(j, t->size)
- make_bfloat(b, t, j, &min_key, &max_key);
-}
-
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
-{
- struct bset_tree *i;
-
- for (i = b->set; i != t; i++)
- BUG_ON(bset_has_rw_aux_tree(i));
-
- bch_bset_set_no_aux_tree(b, t);
-
- /* round up to next cacheline: */
- t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
- SMP_CACHE_BYTES / sizeof(u64));
-
- bset_aux_tree_verify(b);
-}
-
-void bch_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
- bool writeable)
-{
- if (writeable
- ? bset_has_rw_aux_tree(t)
- : bset_has_ro_aux_tree(t))
- return;
-
- bset_alloc_tree(b, t);
-
- if (!__bset_tree_capacity(b, t))
- return;
-
- if (writeable)
- __build_rw_aux_tree(b, t);
- else
- __build_ro_aux_tree(b, t);
-
- bset_aux_tree_verify(b);
-}
-
-void bch_bset_init_first(struct btree *b, struct bset *i)
-{
- struct bset_tree *t;
-
- BUG_ON(b->nsets);
-
- memset(i, 0, sizeof(*i));
- get_random_bytes(&i->seq, sizeof(i->seq));
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-void bch_bset_init_next(struct btree *b, struct bset *i)
-{
- struct bset_tree *t;
-
- BUG_ON(b->nsets >= MAX_BSETS);
-
- memset(i, 0, sizeof(*i));
- i->seq = btree_bset_first(b)->seq;
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- t = &b->set[b->nsets++];
- set_btree_bset(b, t, i);
-}
-
-static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed *p;
- unsigned offset;
- int j;
-
- EBUG_ON(k < btree_bkey_first(b, t) ||
- k > btree_bkey_last(b, t));
-
- if (k == btree_bkey_first(b, t))
- return NULL;
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- p = btree_bkey_first(b, t);
- break;
- case BSET_RO_AUX_TREE:
- j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
-
- do {
- p = j ? tree_to_bkey(b, t,
- __inorder_to_eytzinger(j--,
- t->size, t->extra))
- : btree_bkey_first(b, t);
- } while (p >= k);
- break;
- case BSET_RW_AUX_TREE:
- offset = __btree_node_key_to_offset(b, k);
- j = rw_aux_tree_bsearch(b, t, offset);
- p = j ? rw_aux_to_bkey(b, t, j - 1)
- : btree_bkey_first(b, t);
- break;
- }
-
- return p;
-}
-
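-/* Returns the key immediately before @k, including deleted keys/whiteouts: */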
-struct bkey_packed *bkey_prev_all(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed *p;
-
- p = __bkey_prev(b, t, k);
- if (!p)
- return NULL;
-
- while (bkey_next(p) != k)
- p = bkey_next(p);
-
- return p;
-}
-
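-/* Returns the closest live (non deleted) key before @k, or NULL if none: */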
-struct bkey_packed *bkey_prev(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- while (1) {
- struct bkey_packed *p, *i, *ret = NULL;
-
- p = __bkey_prev(b, t, k);
- if (!p)
- return NULL;
-
- for (i = p; i != k; i = bkey_next(i))
- if (!bkey_deleted(i))
- ret = i;
-
- if (ret)
- return ret;
-
- k = p;
- }
-}
-
-/* Insert */
-
-static void rw_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- unsigned offset = __btree_node_key_to_offset(b, k);
- unsigned j = rw_aux_tree_bsearch(b, t, offset);
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset == offset)
- rw_aux_tree_set(b, t, j, k);
-
- bch_bset_verify_rw_aux_tree(b, t);
-}
-
-static void ro_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed min_key, max_key;
- unsigned inorder, j;
-
- BUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
- if (bkey_next(k) == btree_bkey_last(b, t)) {
- t->max_key = bkey_unpack_pos(b, k);
-
- for (j = 1; j < t->size; j = j * 2 + 1)
- make_bfloat(b, t, j, &min_key, &max_key);
- }
-
- inorder = bkey_to_cacheline(b, t, k);
-
- if (inorder &&
- inorder < t->size) {
- j = __inorder_to_eytzinger(inorder, t->size, t->extra);
-
- if (k == tree_to_bkey(b, t, j)) {
- /* Fix the node this key corresponds to */
- make_bfloat(b, t, j, &min_key, &max_key);
-
- /* Children for which this key is the right boundary */
- for (j = eytzinger_left_child(j);
- j < t->size;
- j = eytzinger_right_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
- }
- }
-
- if (inorder + 1 < t->size) {
- j = __inorder_to_eytzinger(inorder + 1, t->size, t->extra);
-
- if (k == tree_to_prev_bkey(b, t, j)) {
- make_bfloat(b, t, j, &min_key, &max_key);
-
- /* Children for which this key is the left boundary */
- for (j = eytzinger_right_child(j);
- j < t->size;
- j = eytzinger_left_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
- }
- }
-}
-
-/**
- * bch_bset_fix_invalidated_key() - given an existing key @k that has been
- * modified, fix any auxiliary search tree by remaking all the nodes that @k
- * corresponds to
- */
-void bch_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- break;
- case BSET_RO_AUX_TREE:
- ro_aux_tree_fix_invalidated_key(b, t, k);
- break;
- case BSET_RW_AUX_TREE:
- rw_aux_tree_fix_invalidated_key(b, t, k);
- break;
- }
-}
-
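-/*
- * Keep the rw aux tree in sync after @clobber_u64s u64s at @_where have been
- * replaced by @new_u64s u64s: drop entries that pointed into the clobbered
- * range, shift the offsets of everything after it, and add an entry back if
- * the resulting gap grew past a cacheline.
- */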
-static void bch_bset_fix_lookup_table(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *_where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- int shift = new_u64s - clobber_u64s;
- unsigned l, j, where = __btree_node_key_to_offset(b, _where);
-
- BUG_ON(bset_has_ro_aux_tree(t));
-
- if (!bset_has_rw_aux_tree(t))
- return;
-
- l = rw_aux_tree_bsearch(b, t, where);
-
-	/* l is the first entry >= @where */
-
- BUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
- BUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
-
- if (!l) /* never delete first entry */
- l++;
- else if (l < t->size &&
- where < t->end_offset &&
- rw_aux_tree(b, t)[l].offset == where)
- rw_aux_tree_set(b, t, l++, _where);
-
- /* l now > where */
-
- for (j = l;
- j < t->size &&
- rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
- j++)
- ;
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset + shift ==
- rw_aux_tree(b, t)[l - 1].offset)
- j++;
-
- memmove(&rw_aux_tree(b, t)[l],
- &rw_aux_tree(b, t)[j],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[j]);
- t->size -= j - l;
-
- for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
-
- BUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset ==
- rw_aux_tree(b, t)[l - 1].offset);
-
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (l < t->size
- ? rw_aux_tree(b, t)[l].offset
- : t->end_offset) -
- rw_aux_tree(b, t)[l - 1].offset >
- L1_CACHE_BYTES / sizeof(u64)) {
- struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
- struct bkey_packed *end = l < t->size
- ? rw_aux_to_bkey(b, t, l)
- : btree_bkey_last(b, t);
- struct bkey_packed *k = start;
-
- while (1) {
- k = bkey_next(k);
- if (k == end)
- break;
-
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[l + 1],
- &rw_aux_tree(b, t)[l],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[l]);
- t->size++;
- rw_aux_tree_set(b, t, l, k);
- break;
- }
- }
- }
-
- bch_bset_verify_rw_aux_tree(b, t);
- bset_aux_tree_verify(b);
-}
-
-void bch_bset_insert(struct btree *b,
- struct btree_node_iter *iter,
- struct bkey_packed *where,
- struct bkey_i *insert,
- unsigned clobber_u64s)
-{
- struct bkey_format *f = &b->format;
- struct bset_tree *t = bset_tree_last(b);
- struct bkey_packed packed, *src = bkey_to_packed(insert);
-
- bch_bset_verify_rw_aux_tree(b, t);
-
- if (bkey_pack_key(&packed, &insert->k, f))
- src = &packed;
-
- if (!bkey_whiteout(&insert->k))
- btree_keys_account_key_add(&b->nr, t - b->set, src);
-
- if (src->u64s != clobber_u64s) {
- u64 *src_p = where->_data + clobber_u64s;
- u64 *dst_p = where->_data + src->u64s;
-
- BUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
- (int) clobber_u64s - src->u64s);
-
- memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
- set_btree_bset_end(b, t);
- }
-
- memcpy_u64s(where, src,
- bkeyp_key_u64s(f, src));
- memcpy_u64s(bkeyp_val(f, where), &insert->v,
- bkeyp_val_u64s(f, src));
-
- bch_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
-
- bch_verify_key_order(b, iter, where);
- bch_verify_btree_nr_keys(b);
-}
-
-void bch_bset_delete(struct btree *b,
- struct bkey_packed *where,
- unsigned clobber_u64s)
-{
- struct bset_tree *t = bset_tree_last(b);
- u64 *src_p = where->_data + clobber_u64s;
- u64 *dst_p = where->_data;
-
- bch_bset_verify_rw_aux_tree(b, t);
-
- BUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
-
- memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
- le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
- set_btree_bset_end(b, t);
-
- bch_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
-}
-
-/* Lookup */
-
-__flatten
-static struct bkey_packed *bset_search_write_set(const struct btree *b,
- struct bset_tree *t,
- struct bpos search,
- const struct bkey_packed *packed_search)
-{
- unsigned l = 0, r = t->size;
-
- while (l + 1 != r) {
- unsigned m = (l + r) >> 1;
-
- if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0)
- l = m;
- else
- r = m;
- }
-
- return rw_aux_to_bkey(b, t, l);
-}
-
-noinline
-static int bset_search_tree_slowpath(const struct btree *b,
- struct bset_tree *t, struct bpos *search,
- const struct bkey_packed *packed_search,
- unsigned n)
-{
- return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n),
- packed_search, search) < 0;
-}
-
-__flatten
-static struct bkey_packed *bset_search_tree(const struct btree *b,
- struct bset_tree *t,
- struct bpos search,
- const struct bkey_packed *packed_search)
-{
- struct ro_aux_tree *base = ro_aux_tree_base(b, t);
- struct bkey_float *f = bkey_float_get(base, 1);
- void *p;
- unsigned inorder, n = 1;
-
- while (1) {
- if (likely(n << 4 < t->size)) {
- p = bkey_float_get(base, n << 4);
- prefetch(p);
- } else if (n << 3 < t->size) {
- inorder = __eytzinger_to_inorder(n, t->size, t->extra);
- p = bset_cacheline(b, t, inorder);
-#ifdef CONFIG_X86_64
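-			/*
-			 * prefetches the four cachelines at p, p + 64,
-			 * p + 128 and p + 192; the +127/-127 dance presumably
-			 * keeps each displacement within a signed byte for a
-			 * shorter encoding:
-			 */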
- asm(".intel_syntax noprefix;"
- "prefetcht0 [%0 - 127 + 64 * 0];"
- "prefetcht0 [%0 - 127 + 64 * 1];"
- "prefetcht0 [%0 - 127 + 64 * 2];"
- "prefetcht0 [%0 - 127 + 64 * 3];"
- ".att_syntax prefix;"
- :
- : "r" (p + 127));
-#else
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- prefetch(p + L1_CACHE_BYTES * 3);
-#endif
- } else if (n >= t->size)
- break;
-
- f = bkey_float_get(base, n);
-
- if (packed_search &&
- likely(f->exponent < BFLOAT_FAILED))
- n = n * 2 + (bfloat_mantissa(f, n) <
- bkey_mantissa(packed_search, f, n));
- else
- n = n * 2 + bset_search_tree_slowpath(b, t,
- &search, packed_search, n);
- } while (n < t->size);
-
- inorder = __eytzinger_to_inorder(n >> 1, t->size, t->extra);
-
- /*
- * n would have been the node we recursed to - the low bit tells us if
- * we recursed left or recursed right.
- */
- if (n & 1) {
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else {
- if (--inorder) {
- n = eytzinger_prev(n >> 1, t->size);
- f = bkey_float_get(base, n);
- return cacheline_to_bkey(b, t, inorder, f->key_offset);
- } else
- return btree_bkey_first(b, t);
- }
-}
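-
-/*
- * For illustration only - eytzinger_search_sketch() is not part of this file.
- * It shows the same descent recurrence as bset_search_tree() over a plain
- * sorted array laid out in eytzinger (breadth first) order, 1-based with
- * a[0] unused; the real code instead converts the final node back to an
- * inorder cacheline index. __builtin_ffs() is the gcc/clang builtin.
- */
-static inline unsigned eytzinger_search_sketch(const unsigned *a, unsigned nr,
-					       unsigned search)
-{
-	unsigned n = 1;
-
-	/* children of node n are at 2n and 2n + 1, adjacent in memory */
-	while (n <= nr)
-		n = n * 2 + (a[n] < search);
-
-	/*
-	 * The low bits of n record the left/right turns taken; shifting off
-	 * the trailing ones (plus one) recovers the last node where we went
-	 * left, i.e. the first element >= search, or 0 if there is none.
-	 */
-	return n >> __builtin_ffs(~n);
-}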
-
-/*
- * Returns the first key greater than or equal to @search
- */
-__always_inline __flatten
-static struct bkey_packed *bch_bset_search(struct btree *b,
- struct bset_tree *t,
- struct bpos search,
- struct bkey_packed *packed_search,
- const struct bkey_packed *lossy_packed_search,
- bool strictly_greater)
-{
- struct bkey_packed *m;
-
- /*
- * First we search for a cacheline, then we do a linear search within that
- * cacheline.
- *
- * To search for the cacheline, there are three different possibilities:
- * * The set is too small to have a search tree, so we just do a linear
- * search over the whole set.
- * * The set is the one we're currently inserting into; keeping a full
- * auxiliary search tree up to date would be too expensive, so we
- * use a much simpler lookup table to do a binary search -
- * bset_search_write_set().
- * * Or we use the auxiliary search tree we constructed earlier -
- * bset_search_tree()
- */
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- m = btree_bkey_first(b, t);
- break;
- case BSET_RW_AUX_TREE:
- m = bset_search_write_set(b, t, search, lossy_packed_search);
- break;
- case BSET_RO_AUX_TREE:
- /*
- * Each node in the auxiliary search tree covers a certain range
- * of bits, and keys above and below the set it covers might
- * differ outside those bits - so we have to special case the
- * start and end - handle that here:
- */
-
- if (bkey_cmp(search, t->max_key) > 0)
- return btree_bkey_last(b, t);
-
- m = bset_search_tree(b, t, search, lossy_packed_search);
- break;
- }
-
- if (lossy_packed_search)
- while (m != btree_bkey_last(b, t) &&
- !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search,
- m, strictly_greater))
- m = bkey_next(m);
-
- if (!packed_search)
- while (m != btree_bkey_last(b, t) &&
- !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
- m = bkey_next(m);
-
- if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
- struct bkey_packed *prev = bkey_prev_all(b, t, m);
-
- BUG_ON(prev &&
- btree_iter_pos_cmp_p_or_unp(b, search, packed_search,
- prev, strictly_greater));
- }
-
- return m;
-}
-
-/* Btree node iterator */
-
-void bch_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- if (k != end) {
- struct btree_node_iter_set *pos, n =
- ((struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- });
-
- btree_node_iter_for_each(iter, pos)
- if (btree_node_iter_cmp(iter, b, n, *pos) <= 0)
- break;
-
- memmove(pos + 1, pos,
- (void *) (iter->data + iter->used) - (void *) pos);
- iter->used++;
- *pos = n;
- }
-}
-
-noinline __flatten __attribute__((cold))
-static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
- struct btree *b, struct bpos search,
- bool strictly_greater, bool is_extents)
-{
- struct bset_tree *t;
-
- trace_bkey_pack_pos_fail(search);
-
- for_each_bset(b, t)
- __bch_btree_node_iter_push(iter, b,
- bch_bset_search(b, t, search, NULL, NULL,
- strictly_greater),
- btree_bkey_last(b, t));
-
- bch_btree_node_iter_sort(iter, b);
-}
-
-/**
- * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
- * given position
- *
- * Main entry point to the lookup code for individual btree nodes:
- *
- * NOTE:
- *
- * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
- * keys. This doesn't matter for most code, but it does matter for lookups.
- *
- * Consider some adjacent keys that include a run of equal keys:
- * i j k k k k l m
- *
- * If you search for k, the lookup code isn't guaranteed to return you any
- * specific k. The lookup code is conceptually doing a binary search and
- * iterating backwards is very expensive so if the pivot happens to land at the
- * last k that's what you'll get.
- *
- * This works out ok, but it's something to be aware of:
- *
- * - For non extents, we guarantee that the live key comes last - see
- * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
- * see will only be deleted keys you don't care about.
- *
- * - For extents, deleted keys sort last (see the comment at the top of this
- * file). But when you're searching for extents, you actually want the first
- * key strictly greater than your search key - an extent that compares equal
- * to the search key is going to have 0 sectors after the search key.
- *
- * But this does mean that we can't just search for
- * bkey_successor(start_of_range) to get the first extent that overlaps with
- * the range we want - if we're unlucky and there's an extent that ends
- * exactly where we searched, then there could be a deleted key at the same
- * position and we'd get that when we search instead of the preceding extent
- * we needed.
- *
- * So we've got to search for start_of_range, then after the lookup iterate
- * past any extents that compare equal to the position we searched for.
- */
-void bch_btree_node_iter_init(struct btree_node_iter *iter,
- struct btree *b, struct bpos search,
- bool strictly_greater, bool is_extents)
-{
- struct bset_tree *t;
- struct bkey_packed p, *packed_search = NULL;
-
- EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
- bset_aux_tree_verify(b);
-
- __bch_btree_node_iter_init(iter, is_extents);
-
- //if (bkey_cmp(search, b->curr_max_key) > 0)
- // return;
-
- switch (bkey_pack_pos_lossy(&p, search, b)) {
- case BKEY_PACK_POS_EXACT:
- packed_search = &p;
- break;
- case BKEY_PACK_POS_SMALLER:
- packed_search = NULL;
- break;
- case BKEY_PACK_POS_FAIL:
- btree_node_iter_init_pack_failed(iter, b, search,
- strictly_greater, is_extents);
- return;
- }
-
- for_each_bset(b, t)
- __bch_btree_node_iter_push(iter, b,
- bch_bset_search(b, t, search,
- packed_search, &p,
- strictly_greater),
- btree_bkey_last(b, t));
-
- bch_btree_node_iter_sort(iter, b);
-}
-
-void bch_btree_node_iter_init_from_start(struct btree_node_iter *iter,
- struct btree *b,
- bool is_extents)
-{
- struct bset_tree *t;
-
- __bch_btree_node_iter_init(iter, is_extents);
-
- for_each_bset(b, t)
- __bch_btree_node_iter_push(iter, b,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- bch_btree_node_iter_sort(iter, b);
-}
-
-struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *iter,
- struct btree *b,
- struct bset_tree *t)
-{
- struct btree_node_iter_set *set;
-
- BUG_ON(iter->used > MAX_BSETS);
-
- btree_node_iter_for_each(iter, set)
- if (set->end == t->end_offset)
- return __btree_node_offset_to_key(b, set->k);
-
- return btree_bkey_last(b, t);
-}
-
-static inline void btree_node_iter_sift(struct btree_node_iter *iter,
- struct btree *b,
- unsigned start)
-{
- unsigned i;
-
- EBUG_ON(iter->used > MAX_BSETS);
-
- for (i = start;
- i + 1 < iter->used &&
- btree_node_iter_cmp(iter, b, iter->data[i], iter->data[i + 1]) > 0;
- i++)
- swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void btree_node_iter_sort_two(struct btree_node_iter *iter,
- struct btree *b,
- unsigned first)
-{
- if (btree_node_iter_cmp(iter, b,
- iter->data[first],
- iter->data[first + 1]) > 0)
- swap(iter->data[first], iter->data[first + 1]);
-}
-
-void bch_btree_node_iter_sort(struct btree_node_iter *iter,
- struct btree *b)
-{
- EBUG_ON(iter->used > 3);
-
- /* unrolled bubble sort: */
-
- if (iter->used > 2) {
- btree_node_iter_sort_two(iter, b, 0);
- btree_node_iter_sort_two(iter, b, 1);
- }
-
- if (iter->used > 1)
- btree_node_iter_sort_two(iter, b, 0);
-}
-EXPORT_SYMBOL(bch_btree_node_iter_sort);
-
-/**
- * bch_btree_node_iter_advance - advance @iter by one key
- *
- * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a
- * bset might momentarily have out of order extents.
- */
-void bch_btree_node_iter_advance(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *k = bch_btree_node_iter_peek_all(iter, b);
-
- iter->data->k += __bch_btree_node_iter_peek_all(iter, b)->u64s;
-
- BUG_ON(iter->data->k > iter->data->end);
-
- if (iter->data->k == iter->data->end) {
- BUG_ON(iter->used == 0);
- iter->data[0] = iter->data[--iter->used];
- }
-
- btree_node_iter_sift(iter, b, 0);
-
- bch_btree_node_iter_next_check(iter, b, k);
-}
-
-/*
- * Expensive:
- */
-struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *k, *prev = NULL;
- struct btree_node_iter_set *set;
- struct bset_tree *t;
- struct bset_tree *prev_t;
- unsigned end;
-
- bch_btree_node_iter_verify(iter, b);
-
- for_each_bset(b, t) {
- k = bkey_prev_all(b, t,
- bch_btree_node_iter_bset_pos(iter, b, t));
- if (k &&
- (!prev || __btree_node_iter_cmp(iter->is_extents, b,
- k, prev) > 0)) {
- prev = k;
- prev_t = t;
- }
- }
-
- if (!prev)
- return NULL;
-
- /*
- * We're manually memmoving instead of just calling sort() to ensure the
- * prev we picked ends up in slot 0 - sort won't necessarily put it
- * there because of duplicate deleted keys:
- */
- end = __btree_node_key_to_offset(b, btree_bkey_last(b, prev_t));
- btree_node_iter_for_each(iter, set)
- if (set->end == end) {
- memmove(&iter->data[1],
- &iter->data[0],
- (void *) set - (void *) &iter->data[0]);
- goto out;
- }
-
- memmove(&iter->data[1],
- &iter->data[0],
- (void *) &iter->data[iter->used] - (void *) &iter->data[0]);
- iter->used++;
-out:
- iter->data[0].k = __btree_node_key_to_offset(b, prev);
- iter->data[0].end = end;
- return prev;
-}
-
-struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct bkey_packed *k;
-
- do {
- k = bch_btree_node_iter_prev_all(iter, b);
- } while (k && bkey_deleted(k));
-
- return k;
-}
-
-struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
- struct btree *b,
- struct bkey *u)
-{
- struct bkey_packed *k = bch_btree_node_iter_peek(iter, b);
-
- return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
-}
-EXPORT_SYMBOL(bch_btree_node_iter_peek_unpack);
-
-/* Mergesort */
-
-void bch_btree_keys_stats(struct btree *b, struct bset_stats *stats)
-{
- struct bset_tree *t;
-
- for_each_bset(b, t) {
- enum bset_aux_tree_type type = bset_aux_tree_type(t);
- size_t j;
-
- stats->sets[type].nr++;
- stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
- sizeof(u64);
-
- if (bset_has_ro_aux_tree(t)) {
- stats->floats += t->size - 1;
-
- for (j = 1; j < t->size; j++)
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
- stats->failed_unpacked++;
- break;
- case BFLOAT_FAILED_PREV:
- stats->failed_prev++;
- break;
- case BFLOAT_FAILED_OVERFLOW:
- stats->failed_overflow++;
- break;
- }
- }
- }
-}
-
-int bch_bkey_print_bfloat(struct btree *b, struct bkey_packed *k,
- char *buf, size_t size)
-{
- struct bset_tree *t = bch_bkey_to_bset(b, k);
- struct bkey_packed *l, *r, *p;
- struct bkey uk, up;
- char buf1[200], buf2[200];
- unsigned j;
-
- if (!size)
- return 0;
-
- if (!bset_has_ro_aux_tree(t))
- goto out;
-
- j = __inorder_to_eytzinger(bkey_to_cacheline(b, t, k), t->size, t->extra);
- if (j &&
- j < t->size &&
- k == tree_to_bkey(b, t, j))
- switch (bkey_float(b, t, j)->exponent) {
- case BFLOAT_FAILED_UNPACKED:
- uk = bkey_unpack_key(b, k);
- return scnprintf(buf, size,
- " failed unpacked at depth %u\n"
- "\t%llu:%llu\n",
- ilog2(j),
- uk.p.inode, uk.p.offset);
- case BFLOAT_FAILED_PREV:
- p = tree_to_prev_bkey(b, t, j);
- l = is_power_of_2(j)
- ? btree_bkey_first(b, t)
- : tree_to_prev_bkey(b, t, j >> ffs(j));
- r = is_power_of_2(j + 1)
- ? bkey_prev_all(b, t, btree_bkey_last(b, t))
- : tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- up = bkey_unpack_key(b, p);
- uk = bkey_unpack_key(b, k);
- bch_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits);
- bch_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits);
-
- return scnprintf(buf, size,
- " failed prev at depth %u\n"
- "\tkey starts at bit %u but first differing bit at %u\n"
- "\t%llu:%llu\n"
- "\t%llu:%llu\n"
- "\t%s\n"
- "\t%s\n",
- ilog2(j),
- bkey_greatest_differing_bit(b, l, r),
- bkey_greatest_differing_bit(b, p, k),
- uk.p.inode, uk.p.offset,
- up.p.inode, up.p.offset,
- buf1, buf2);
- case BFLOAT_FAILED_OVERFLOW:
- uk = bkey_unpack_key(b, k);
- return scnprintf(buf, size,
- " failed overflow at depth %u\n"
- "\t%llu:%llu\n",
- ilog2(j),
- uk.p.inode, uk.p.offset);
- }
-out:
- *buf = '\0';
- return 0;
-}
diff --git a/libbcache/bset.h b/libbcache/bset.h
deleted file mode 100644
index 70868c51..00000000
--- a/libbcache/bset.h
+++ /dev/null
@@ -1,615 +0,0 @@
-#ifndef _BCACHE_BSET_H
-#define _BCACHE_BSET_H
-
-#include <linux/bcache.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-#include "bkey.h"
-#include "bkey_methods.h"
-#include "btree_types.h"
-#include "util.h" /* for time_stats */
-#include "vstructs.h"
-
-/*
- * BKEYS:
- *
- * A bkey contains a key, a size field, a variable number of pointers, and some
- * ancillary flag bits.
- *
- * We use two different functions for validating bkeys, bkey_invalid and
- * bkey_deleted().
- *
- * The one exception to the rule that ptr_invalid() filters out invalid keys is
- * that it also filters out keys of size 0 - these are keys that have been
- * completely overwritten. It'd be safe to delete these in memory while leaving
- * them on disk, just unnecessary work - so we filter them out when resorting
- * instead.
- *
- * We can't filter out stale keys when we're resorting, because garbage
- * collection needs to find them to ensure bucket gens don't wrap around -
- * unless we're rewriting the btree node those stale keys still exist on disk.
- *
- * We also implement functions here for removing some number of sectors from the
- * front or the back of a bkey - this is mainly used for fixing overlapping
- * extents, by removing the overlapping sectors from the older key.
- *
- * BSETS:
- *
- * A bset is an array of bkeys laid out contiguously in memory in sorted order,
- * along with a header. A btree node is made up of a number of these, written at
- * different times.
- *
- * There could be many of them on disk, but we never allow there to be more than
- * 4 in memory - we lazily resort as needed.
- *
- * We implement code here for creating and maintaining auxiliary search trees
- * (described below) for searching an individual bset, and on top of that we
- * implement a btree iterator.
- *
- * BTREE ITERATOR:
- *
- * Most of the code in bcache doesn't care about an individual bset - it needs
- * to search entire btree nodes and iterate over them in sorted order.
- *
- * The btree iterator code serves both functions; it iterates through the keys
- * in a btree node in sorted order, starting from either keys after a specific
- * point (if you pass it a search key) or the start of the btree node.
- *
- * AUXILIARY SEARCH TREES:
- *
- * Since keys are variable length, we can't use a binary search on a bset - we
- * wouldn't be able to find the start of the next key. But binary searches are
- * slow anyways, due to terrible cache behaviour; bcache originally used binary
- * searches and that code topped out at under 50k lookups/second.
- *
- * So we need to construct some sort of lookup table. Since we only insert keys
- * into the last (unwritten) set, most of the keys within a given btree node are
- * usually in sets that are mostly constant. We use two different types of
- * lookup tables to take advantage of this.
- *
- * Both lookup tables share in common that they don't index every key in the
- * set; they index one key every BSET_CACHELINE bytes, and then a linear search
- * is used for the rest.
- *
- * For sets that have been written to disk and are no longer being inserted
- * into, we construct a binary search tree in an array - traversing a binary
- * search tree in an array gives excellent locality of reference and is very
- * fast, since both children of any node are adjacent to each other in memory
- * (and their grandchildren, and great grandchildren...) - this means
- * prefetching can be used to great effect.
- *
- * It's quite useful performance wise to keep these nodes small - not just
- * because they're more likely to be in L2, but also because we can prefetch
- * more nodes on a single cacheline and thus prefetch more iterations in advance
- * when traversing this tree.
- *
- * Nodes in the auxiliary search tree must contain both a key to compare against
- * (we don't want to fetch the key from the set, that would defeat the purpose),
- * and a pointer to the key. We use a few tricks to compress both of these.
- *
- * To compress the pointer, we take advantage of the fact that one node in the
- * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
- * a function (to_inorder()) that takes the index of a node in a binary tree and
- * returns what its index would be in an inorder traversal, so we only have to
- * store the low bits of the offset.
- *
- * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
- * compress that, we take advantage of the fact that when we're traversing the
- * search tree at every iteration we know that both our search key and the key
- * we're looking for lie within some range - bounded by our previous
- * comparisons. (We special case the start of a search so that this is true even
- * at the root of the tree).
- *
- * So if we know the key we're looking for is between a and b, and a and b don't
- * differ higher than bit 50, we don't need to check anything higher than bit
- * 50.
- *
- * We don't usually need the rest of the bits, either; we only need enough bits
- * to partition the key range we're currently checking. Consider key n - the
- * key our auxiliary search tree node corresponds to, and key p, the key
- * immediately preceding n. The lowest bit we need to store in the auxiliary
- * search tree is the highest bit that differs between n and p.
- *
- * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
- * comparison. But we'd really like our nodes in the auxiliary search tree to be
- * of fixed size.
- *
- * The solution is to make them fixed size, and when we're constructing a node
- * check if p and n differed in the bits we needed them to. If they didn't, we
- * flag that node, and when doing lookups we fall back to comparing against the
- * real key. As long as this doesn't happen too often (and it seems to reliably
- * happen a bit less than 1% of the time), we win - even on failures, that key
- * is then more likely to be in cache than if we were doing binary searches all
- * the way, since we're touching so much less memory.
- *
- * The keys in the auxiliary search tree are stored in (software) floating
- * point, with an exponent and a mantissa. The exponent needs to be big enough
- * to address all the bits in the original key, but the number of bits in the
- * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
- *
- * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
- * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
- * We need one node per 128 bytes in the btree node, which means the auxiliary
- * search trees take up 3% as much memory as the btree itself.
- *
- * Constructing these auxiliary search trees is moderately expensive, and we
- * don't want to be constantly rebuilding the search tree for the last set
- * whenever we insert another key into it. For the unwritten set, we use a much
- * simpler lookup table - it's just a flat array, so index i in the lookup table
- * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
- * within each byte range works the same as with the auxiliary search trees.
- *
- * These are much easier to keep up to date when we insert a key - we do it
- * somewhat lazily; when we shift a key up we usually just increment the pointer
- * to it, only when it would overflow do we go to the trouble of finding the
- * first key in that range of bytes again.
- */
-
-struct btree_node_iter;
-struct btree_node_iter_set;
-
-enum bset_aux_tree_type {
- BSET_NO_AUX_TREE,
- BSET_RO_AUX_TREE,
- BSET_RW_AUX_TREE,
-};
-
-#define BSET_TREE_NR_TYPES 3
-
-#define BSET_NO_AUX_TREE_VAL (U16_MAX)
-#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
-
-static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
-{
- switch (t->extra) {
- case BSET_NO_AUX_TREE_VAL:
- EBUG_ON(t->size);
- return BSET_NO_AUX_TREE;
- case BSET_RW_AUX_TREE_VAL:
- EBUG_ON(!t->size);
- return BSET_RW_AUX_TREE;
- default:
- EBUG_ON(!t->size);
- return BSET_RO_AUX_TREE;
- }
-}
-
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
- struct bkey dst;
-
-#ifdef HAVE_BCACHE_COMPILED_UNPACK
- {
- compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(&dst, src);
-
- if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
- struct bkey dst2 = __bkey_unpack_key(&b->format, src);
-
- BUG_ON(memcmp(&dst, &dst2, sizeof(dst)));
- }
- }
-#else
- dst = __bkey_unpack_key(&b->format, src);
-#endif
- return dst;
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_key_format_checked(b, src)
- : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHE_COMPILED_UNPACK
- return bkey_unpack_key_format_checked(b, src).p;
-#else
- return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_pos_format_checked(b, src)
- : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(struct btree *b,
- const struct bkey_packed *k,
- struct bkey *u)
-{
- *u = bkey_unpack_key(b, k);
-
- return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(struct btree *b,
- struct bkey_packed *k,
- struct bkey *u)
-{
- *u = bkey_unpack_key(b, k);
-
- return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
-#define for_each_bset(_b, _t) \
- for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
-
-extern bool bch_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(struct btree *b)
-{
-#ifdef CONFIG_BCACHE_DEBUG
- return bch_expensive_debug_checks || *b->expensive_debug_checks;
-#else
- return false;
-#endif
-}
-
-static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
-}
-
-static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
-{
- return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
-}
-
-static inline void bch_bset_set_no_aux_tree(struct btree *b,
- struct bset_tree *t)
-{
- BUG_ON(t < b->set);
-
- for (; t < b->set + ARRAY_SIZE(b->set); t++) {
- t->size = 0;
- t->extra = BSET_NO_AUX_TREE_VAL;
- t->aux_data_offset = U16_MAX;
- }
-}
-
-static inline void btree_node_set_format(struct btree *b,
- struct bkey_format f)
-{
- int len;
-
- b->format = f;
- b->nr_key_bits = bkey_format_key_bits(&f);
-
- len = bch_compile_bkey_format(&b->format, b->aux_data);
- BUG_ON(len < 0 || len > U8_MAX);
-
- b->unpack_fn_len = len;
-
- bch_bset_set_no_aux_tree(b, b->set);
-}
-
-static inline struct bset *bset_next_set(struct btree *b,
- unsigned block_bytes)
-{
- struct bset *i = btree_bset_last(b);
-
- EBUG_ON(!is_power_of_2(block_bytes));
-
- return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
-}
-
-void bch_btree_keys_free(struct btree *);
-int bch_btree_keys_alloc(struct btree *, unsigned, gfp_t);
-void bch_btree_keys_init(struct btree *, bool *);
-
-void bch_bset_init_first(struct btree *, struct bset *);
-void bch_bset_init_next(struct btree *, struct bset *);
-void bch_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
- struct bkey_packed *);
-
-void bch_bset_insert(struct btree *, struct btree_node_iter *,
- struct bkey_packed *, struct bkey_i *, unsigned);
-void bch_bset_delete(struct btree *, struct bkey_packed *, unsigned);
-
-/* Bkey utility code */
-
-/* packed or unpacked */
-static inline int bkey_cmp_p_or_unp(const struct btree *b,
- const struct bkey_packed *l,
- const struct bkey_packed *r_packed,
- struct bpos *r)
-{
- EBUG_ON(r_packed && !bkey_packed(r_packed));
-
- if (unlikely(!bkey_packed(l)))
- return bkey_cmp(packed_to_bkey_c(l)->p, *r);
-
- if (likely(r_packed))
- return __bkey_cmp_packed_format_checked(l, r_packed, b);
-
- return __bkey_cmp_left_packed_format_checked(b, l, r);
-}
-
-/* Returns true if @k is after iterator position @pos */
-static inline bool btree_iter_pos_cmp(struct bpos pos, const struct bkey *k,
- bool strictly_greater)
-{
- int cmp = bkey_cmp(k->p, pos);
-
- return cmp > 0 ||
- (cmp == 0 && !strictly_greater && !bkey_deleted(k));
-}
-
-static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
- struct bpos *pos,
- const struct bkey_packed *k,
- bool strictly_greater)
-{
- int cmp = bkey_cmp_left_packed(b, k, pos);
-
- return cmp > 0 ||
- (cmp == 0 && !strictly_greater && !bkey_deleted(k));
-}
-
-static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
- struct bpos pos,
- const struct bkey_packed *pos_packed,
- const struct bkey_packed *k,
- bool strictly_greater)
-{
- int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
-
- return cmp > 0 ||
- (cmp == 0 && !strictly_greater && !bkey_deleted(k));
-}
-
-struct bset_tree *bch_bkey_to_bset(struct btree *, struct bkey_packed *);
-struct bkey_packed *bkey_prev_all(struct btree *, struct bset_tree *,
- struct bkey_packed *);
-struct bkey_packed *bkey_prev(struct btree *, struct bset_tree *,
- struct bkey_packed *);
-
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_cmp(k->p, m->p) < 0;
- int cmp2 = bkey_cmp(bkey_start_pos(k),
- bkey_start_pos(m)) > 0;
-
- return (cmp1 << 1) + cmp2;
-}
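-
-/*
- * e.g. with m spanning sectors [10, 20): k = [5, 25) -> ALL, k = [15, 25) ->
- * BACK, k = [5, 15) -> FRONT, k = [12, 18) -> MIDDLE (intervals here are
- * [bkey_start_pos(k), k->p)).
- */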
-
-/* Btree key iteration */
-
-struct btree_node_iter {
- u8 is_extents;
- u16 used;
-
- struct btree_node_iter_set {
- u16 k, end;
- } data[MAX_BSETS];
-};
-
-static inline void __bch_btree_node_iter_init(struct btree_node_iter *iter,
- bool is_extents)
-{
- iter->used = 0;
- iter->is_extents = is_extents;
-}
-
-void bch_btree_node_iter_push(struct btree_node_iter *, struct btree *,
- const struct bkey_packed *,
- const struct bkey_packed *);
-void bch_btree_node_iter_init(struct btree_node_iter *, struct btree *,
- struct bpos, bool, bool);
-void bch_btree_node_iter_init_from_start(struct btree_node_iter *,
- struct btree *, bool);
-struct bkey_packed *bch_btree_node_iter_bset_pos(struct btree_node_iter *,
- struct btree *,
- struct bset_tree *);
-
-void bch_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
-void bch_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
-
-#define btree_node_iter_for_each(_iter, _set) \
- for (_set = (_iter)->data; \
- _set < (_iter)->data + (_iter)->used; \
- _set++)
-
-static inline bool bch_btree_node_iter_end(struct btree_node_iter *iter)
-{
- return !iter->used;
-}
-
-static inline int __btree_node_iter_cmp(bool is_extents,
- struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- /*
- * For non extents, when keys compare equal the deleted keys have to
- * come first - so that bch_btree_node_iter_next_check() can detect
- * duplicate nondeleted keys (and possibly other reasons?)
- *
- * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
- * deleted keys have to sort last.
- */
- return bkey_cmp_packed(b, l, r) ?: is_extents
- ? (int) bkey_deleted(l) - (int) bkey_deleted(r)
- : (int) bkey_deleted(r) - (int) bkey_deleted(l);
-}
-
-static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
- struct btree *b,
- struct btree_node_iter_set l,
- struct btree_node_iter_set r)
-{
- return __btree_node_iter_cmp(iter->is_extents, b,
- __btree_node_offset_to_key(b, l.k),
- __btree_node_offset_to_key(b, r.k));
-}
-
-static inline void __bch_btree_node_iter_push(struct btree_node_iter *iter,
- struct btree *b,
- const struct bkey_packed *k,
- const struct bkey_packed *end)
-{
- if (k != end)
- iter->data[iter->used++] = (struct btree_node_iter_set) {
- __btree_node_key_to_offset(b, k),
- __btree_node_key_to_offset(b, end)
- };
-}
-
-static inline struct bkey_packed *
-__bch_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- return __btree_node_offset_to_key(b, iter->data->k);
-}
-
-static inline struct bkey_packed *
-bch_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- return bch_btree_node_iter_end(iter)
- ? NULL
- : __bch_btree_node_iter_peek_all(iter, b);
-}
-
-static inline struct bkey_packed *
-bch_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *ret;
-
- while ((ret = bch_btree_node_iter_peek_all(iter, b)) &&
- bkey_deleted(ret))
- bch_btree_node_iter_advance(iter, b);
-
- return ret;
-}
-
-static inline struct bkey_packed *
-bch_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
-{
- struct bkey_packed *ret = bch_btree_node_iter_peek_all(iter, b);
-
- if (ret)
- bch_btree_node_iter_advance(iter, b);
-
- return ret;
-}
-
-struct bkey_packed *bch_btree_node_iter_prev_all(struct btree_node_iter *,
- struct btree *);
-struct bkey_packed *bch_btree_node_iter_prev(struct btree_node_iter *,
- struct btree *);
-
-/*
- * Iterates over all _live_ keys - skipping deleted (and potentially
- * overlapping) keys
- */
-#define for_each_btree_node_key(b, k, iter, _is_extents) \
- for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
- ((k) = bch_btree_node_iter_peek(iter, b)); \
- bch_btree_node_iter_advance(iter, b))
-
-struct bkey_s_c bch_btree_node_iter_peek_unpack(struct btree_node_iter *,
- struct btree *,
- struct bkey *);
-
-#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
- for (bch_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
- (k = bch_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
- bch_btree_node_iter_advance(iter, b))
-
-/* Accounting: */
-
-static inline void btree_keys_account_key(struct btree_nr_keys *n,
- unsigned bset,
- struct bkey_packed *k,
- int sign)
-{
- n->live_u64s += k->u64s * sign;
- n->bset_u64s[bset] += k->u64s * sign;
-
- if (bkey_packed(k))
- n->packed_keys += sign;
- else
- n->unpacked_keys += sign;
-}
-
-#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, 1)
-#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
- btree_keys_account_key(_nr, _bset_idx, _k, -1)
-
-struct bset_stats {
- struct {
- size_t nr, bytes;
- } sets[BSET_TREE_NR_TYPES];
-
- size_t floats;
- size_t failed_unpacked;
- size_t failed_prev;
- size_t failed_overflow;
-};
-
-void bch_btree_keys_stats(struct btree *, struct bset_stats *);
-int bch_bkey_print_bfloat(struct btree *, struct bkey_packed *,
- char *, size_t);
-
-/* Debug stuff */
-
-void bch_dump_bset(struct btree *, struct bset *, unsigned);
-void bch_dump_btree_node(struct btree *);
-void bch_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-
-#ifdef CONFIG_BCACHE_DEBUG
-
-void __bch_verify_btree_nr_keys(struct btree *);
-void bch_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-void bch_verify_key_order(struct btree *, struct btree_node_iter *,
- struct bkey_packed *);
-
-#else
-
-static inline void __bch_verify_btree_nr_keys(struct btree *b) {}
-static inline void bch_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b) {}
-static inline void bch_verify_key_order(struct btree *b,
- struct btree_node_iter *iter,
- struct bkey_packed *where) {}
-#endif
-
-static inline void bch_verify_btree_nr_keys(struct btree *b)
-{
- if (btree_keys_expensive_checks(b))
- __bch_verify_btree_nr_keys(b);
-}
-
-#endif
diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c
deleted file mode 100644
index a43e12da..00000000
--- a/libbcache/btree_cache.c
+++ /dev/null
@@ -1,756 +0,0 @@
-
-#include "bcache.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "extents.h"
-
-#include <trace/events/bcache.h>
-
-#define DEF_BTREE_ID(kwd, val, name) name,
-
-const char * const bch_btree_ids[] = {
- DEFINE_BCH_BTREE_IDS()
- NULL
-};
-
-#undef DEF_BTREE_ID
-
-void bch_recalc_btree_reserve(struct bch_fs *c)
-{
- unsigned i, reserve = 16;
-
- if (!c->btree_roots[0].b)
- reserve += 8;
-
- for (i = 0; i < BTREE_ID_NR; i++)
- if (c->btree_roots[i].b)
- reserve += min_t(unsigned, 1,
- c->btree_roots[i].b->level) * 8;
-
- c->btree_cache_reserve = reserve;
-}
-
-#define mca_can_free(c) \
- max_t(int, 0, c->btree_cache_used - c->btree_cache_reserve)
-
-static void __mca_data_free(struct bch_fs *c, struct btree *b)
-{
- EBUG_ON(btree_node_write_in_flight(b));
-
- free_pages((unsigned long) b->data, btree_page_order(c));
- b->data = NULL;
- bch_btree_keys_free(b);
-}
-
-static void mca_data_free(struct bch_fs *c, struct btree *b)
-{
- __mca_data_free(c, b);
- c->btree_cache_used--;
- list_move(&b->list, &c->btree_cache_freed);
-}
-
-#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
-
-static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, key.v),
- .key_len = sizeof(struct bch_extent_ptr),
-};
-
-static void mca_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
-{
- unsigned order = ilog2(btree_pages(c));
-
- b->data = (void *) __get_free_pages(gfp, order);
- if (!b->data)
- goto err;
-
- if (bch_btree_keys_alloc(b, order, gfp))
- goto err;
-
- c->btree_cache_used++;
- list_move(&b->list, &c->btree_cache_freeable);
- return;
-err:
- free_pages((unsigned long) b->data, order);
- b->data = NULL;
- list_move(&b->list, &c->btree_cache_freed);
-}
-
-static struct btree *mca_bucket_alloc(struct bch_fs *c, gfp_t gfp)
-{
- struct btree *b = kzalloc(sizeof(struct btree), gfp);
- if (!b)
- return NULL;
-
- six_lock_init(&b->lock);
- INIT_LIST_HEAD(&b->list);
- INIT_LIST_HEAD(&b->write_blocked);
-
- mca_data_alloc(c, b, gfp);
- return b->data ? b : NULL;
-}
-
-/* Btree in memory cache - hash table */
-
-void mca_hash_remove(struct bch_fs *c, struct btree *b)
-{
- BUG_ON(btree_node_dirty(b));
-
- b->nsets = 0;
-
- rhashtable_remove_fast(&c->btree_cache_table, &b->hash,
- bch_btree_cache_params);
-
- /* Cause future lookups for this node to fail: */
- bkey_i_to_extent(&b->key)->v._data[0] = 0;
-}
-
-int mca_hash_insert(struct bch_fs *c, struct btree *b,
- unsigned level, enum btree_id id)
-{
- int ret;
- b->level = level;
- b->btree_id = id;
-
- ret = rhashtable_lookup_insert_fast(&c->btree_cache_table, &b->hash,
- bch_btree_cache_params);
- if (ret)
- return ret;
-
- mutex_lock(&c->btree_cache_lock);
- list_add(&b->list, &c->btree_cache);
- mutex_unlock(&c->btree_cache_lock);
-
- return 0;
-}
-
-__flatten
-static inline struct btree *mca_find(struct bch_fs *c,
- const struct bkey_i *k)
-{
- return rhashtable_lookup_fast(&c->btree_cache_table, &PTR_HASH(k),
- bch_btree_cache_params);
-}
-
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int mca_reap_notrace(struct bch_fs *c, struct btree *b, bool flush)
-{
- lockdep_assert_held(&c->btree_cache_lock);
-
- if (!six_trylock_intent(&b->lock))
- return -ENOMEM;
-
- if (!six_trylock_write(&b->lock))
- goto out_unlock_intent;
-
- if (btree_node_write_error(b) ||
- btree_node_noevict(b))
- goto out_unlock;
-
- if (!list_empty(&b->write_blocked))
- goto out_unlock;
-
- if (!flush &&
- (btree_node_dirty(b) ||
- btree_node_write_in_flight(b)))
- goto out_unlock;
-
- /*
- * Using the underscore version because we don't want to compact bsets
- * after the write, since this node is about to be evicted - unless
- * btree verify mode is enabled, since it runs out of the post write
- * cleanup:
- */
- if (btree_node_dirty(b)) {
- if (verify_btree_ondisk(c))
- bch_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1);
- else
- __bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
- }
-
- /* wait for any in flight btree write */
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-
- return 0;
-out_unlock:
- six_unlock_write(&b->lock);
-out_unlock_intent:
- six_unlock_intent(&b->lock);
- return -ENOMEM;
-}
-
-static int mca_reap(struct bch_fs *c, struct btree *b, bool flush)
-{
- int ret = mca_reap_notrace(c, b, flush);
-
- trace_bcache_mca_reap(c, b, ret);
- return ret;
-}
-
-static unsigned long bch_mca_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache_shrink);
- struct btree *b, *t;
- unsigned long nr = sc->nr_to_scan;
- unsigned long can_free;
- unsigned long touched = 0;
- unsigned long freed = 0;
- unsigned i;
-
- u64 start_time = local_clock();
-
- if (btree_shrinker_disabled(c))
- return SHRINK_STOP;
-
- if (c->btree_cache_alloc_lock)
- return SHRINK_STOP;
-
- /* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_IO)
- mutex_lock(&c->btree_cache_lock);
- else if (!mutex_trylock(&c->btree_cache_lock))
- return -1;
-
- /*
- * It's _really_ critical that we don't free too many btree nodes - we
- * have to always leave ourselves a reserve. The reserve is how we
- * guarantee that allocating memory for a new btree node can always
- * succeed, so that inserting keys into the btree can always succeed and
- * IO can always make forward progress:
- */
- nr /= btree_pages(c);
- can_free = mca_can_free(c);
- nr = min_t(unsigned long, nr, can_free);
-
- i = 0;
- list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
- touched++;
-
- if (freed >= nr)
- break;
-
- if (++i > 3 &&
- !mca_reap_notrace(c, b, false)) {
- mca_data_free(c, b);
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
- freed++;
- }
- }
-restart:
- list_for_each_entry_safe(b, t, &c->btree_cache, list) {
- touched++;
-
- if (freed >= nr) {
- /* Save position */
- if (&t->list != &c->btree_cache)
- list_move_tail(&c->btree_cache, &t->list);
- break;
- }
-
- if (!btree_node_accessed(b) &&
- !mca_reap(c, b, false)) {
- /* can't call mca_hash_remove under btree_cache_lock */
- freed++;
- if (&t->list != &c->btree_cache)
- list_move_tail(&c->btree_cache, &t->list);
-
- mca_data_free(c, b);
- mutex_unlock(&c->btree_cache_lock);
-
- mca_hash_remove(c, b);
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
-
- if (freed >= nr)
- goto out;
-
- if (sc->gfp_mask & __GFP_IO)
- mutex_lock(&c->btree_cache_lock);
- else if (!mutex_trylock(&c->btree_cache_lock))
- goto out;
- goto restart;
- } else
- clear_btree_node_accessed(b);
- }
-
- mutex_unlock(&c->btree_cache_lock);
-out:
- bch_time_stats_update(&c->mca_scan_time, start_time);
-
- trace_bcache_mca_scan(c,
- touched * btree_pages(c),
- freed * btree_pages(c),
- can_free * btree_pages(c),
- sc->nr_to_scan);
-
- return (unsigned long) freed * btree_pages(c);
-}
-
-static unsigned long bch_mca_count(struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache_shrink);
-
- if (btree_shrinker_disabled(c))
- return 0;
-
- if (c->btree_cache_alloc_lock)
- return 0;
-
- return mca_can_free(c) * btree_pages(c);
-}
-
-void bch_fs_btree_exit(struct bch_fs *c)
-{
- struct btree *b;
- unsigned i;
-
- if (c->btree_cache_shrink.list.next)
- unregister_shrinker(&c->btree_cache_shrink);
-
- mutex_lock(&c->btree_cache_lock);
-
-#ifdef CONFIG_BCACHE_DEBUG
- if (c->verify_data)
- list_move(&c->verify_data->list, &c->btree_cache);
-
- free_pages((unsigned long) c->verify_ondisk, ilog2(btree_pages(c)));
-#endif
-
- for (i = 0; i < BTREE_ID_NR; i++)
- if (c->btree_roots[i].b)
- list_add(&c->btree_roots[i].b->list, &c->btree_cache);
-
- list_splice(&c->btree_cache_freeable,
- &c->btree_cache);
-
- while (!list_empty(&c->btree_cache)) {
- b = list_first_entry(&c->btree_cache, struct btree, list);
-
- if (btree_node_dirty(b))
- bch_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(b);
-
- mca_data_free(c, b);
- }
-
- while (!list_empty(&c->btree_cache_freed)) {
- b = list_first_entry(&c->btree_cache_freed,
- struct btree, list);
- list_del(&b->list);
- kfree(b);
- }
-
- mutex_unlock(&c->btree_cache_lock);
-
- if (c->btree_cache_table_init_done)
- rhashtable_destroy(&c->btree_cache_table);
-}
-
-int bch_fs_btree_init(struct bch_fs *c)
-{
- unsigned i;
- int ret;
-
- ret = rhashtable_init(&c->btree_cache_table, &bch_btree_cache_params);
- if (ret)
- return ret;
-
- c->btree_cache_table_init_done = true;
-
- bch_recalc_btree_reserve(c);
-
- for (i = 0; i < c->btree_cache_reserve; i++)
- if (!mca_bucket_alloc(c, GFP_KERNEL))
- return -ENOMEM;
-
- list_splice_init(&c->btree_cache,
- &c->btree_cache_freeable);
-
-#ifdef CONFIG_BCACHE_DEBUG
- mutex_init(&c->verify_lock);
-
- c->verify_ondisk = (void *)
- __get_free_pages(GFP_KERNEL, ilog2(btree_pages(c)));
- if (!c->verify_ondisk)
- return -ENOMEM;
-
- c->verify_data = mca_bucket_alloc(c, GFP_KERNEL);
- if (!c->verify_data)
- return -ENOMEM;
-
- list_del_init(&c->verify_data->list);
-#endif
-
- c->btree_cache_shrink.count_objects = bch_mca_count;
- c->btree_cache_shrink.scan_objects = bch_mca_scan;
- c->btree_cache_shrink.seeks = 4;
- c->btree_cache_shrink.batch = btree_pages(c) * 2;
- register_shrinker(&c->btree_cache_shrink);
-
- return 0;
-}
-
-/*
- * We can only have one thread cannibalizing other cached btree nodes at a time,
- * or we'll deadlock. We use an open coded mutex to ensure that, which
- * mca_cannibalize_lock() takes. This means every time we unlock the root of
- * the btree, we need to release this lock if we have it held.
- */
-void mca_cannibalize_unlock(struct bch_fs *c)
-{
- if (c->btree_cache_alloc_lock == current) {
- trace_bcache_mca_cannibalize_unlock(c);
- c->btree_cache_alloc_lock = NULL;
- closure_wake_up(&c->mca_wait);
- }
-}
-
-int mca_cannibalize_lock(struct bch_fs *c, struct closure *cl)
-{
- struct task_struct *old;
-
- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
- if (old == NULL || old == current)
- goto success;
-
- if (!cl) {
- trace_bcache_mca_cannibalize_lock_fail(c);
- return -ENOMEM;
- }
-
- closure_wait(&c->mca_wait, cl);
-
- /* Try again, after adding ourselves to waitlist */
- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
- if (old == NULL || old == current) {
- /* We raced */
- closure_wake_up(&c->mca_wait);
- goto success;
- }
-
- trace_bcache_mca_cannibalize_lock_fail(c);
- return -EAGAIN;
-
-success:
- trace_bcache_mca_cannibalize_lock(c);
- return 0;
-}
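
A sketch of how a caller might pair these two helpers (hypothetical wrapper, error paths trimmed): take the lock before an allocation that may need to steal a cached node, and drop it as soon as cannibalizing is no longer needed.

static struct btree *alloc_may_cannibalize(struct bch_fs *c, struct closure *cl)
{
        struct btree *b;
        int ret;

        ret = mca_cannibalize_lock(c, cl);      /* -ENOMEM without @cl, -EAGAIN after waiting */
        if (ret)
                return ERR_PTR(ret);

        b = mca_alloc(c);                       /* may now fall back to mca_cannibalize() */

        mca_cannibalize_unlock(c);              /* also released whenever the btree root is unlocked */
        return b;
}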
-
-static struct btree *mca_cannibalize(struct bch_fs *c)
-{
- struct btree *b;
-
- list_for_each_entry_reverse(b, &c->btree_cache, list)
- if (!mca_reap(c, b, false))
- return b;
-
- while (1) {
- list_for_each_entry_reverse(b, &c->btree_cache, list)
- if (!mca_reap(c, b, true))
- return b;
-
- /*
- * Rare case: all nodes were intent-locked.
- * Just busy-wait.
- */
- WARN_ONCE(1, "btree cache cannibalize failed\n");
- cond_resched();
- }
-}
-
-struct btree *mca_alloc(struct bch_fs *c)
-{
- struct btree *b;
- u64 start_time = local_clock();
-
- mutex_lock(&c->btree_cache_lock);
-
- /*
- * btree_free() doesn't free memory; it sticks the node on the end of
- * the list. Check if there's any freed nodes there:
- */
- list_for_each_entry(b, &c->btree_cache_freeable, list)
- if (!mca_reap_notrace(c, b, false))
- goto out_unlock;
-
- /*
- * We never free struct btree itself, just the memory that holds the on
- * disk node. Check the freed list before allocating a new one:
- */
- list_for_each_entry(b, &c->btree_cache_freed, list)
- if (!mca_reap_notrace(c, b, false)) {
- mca_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
- if (b->data)
- goto out_unlock;
-
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
- goto err;
- }
-
- b = mca_bucket_alloc(c, __GFP_NOWARN|GFP_NOIO);
- if (!b)
- goto err;
-
- BUG_ON(!six_trylock_intent(&b->lock));
- BUG_ON(!six_trylock_write(&b->lock));
-out_unlock:
- BUG_ON(bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key));
- BUG_ON(btree_node_write_in_flight(b));
-
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache_lock);
-out:
- b->flags = 0;
- b->written = 0;
- b->nsets = 0;
- b->sib_u64s[0] = 0;
- b->sib_u64s[1] = 0;
- b->whiteout_u64s = 0;
- b->uncompacted_whiteout_u64s = 0;
- bch_btree_keys_init(b, &c->expensive_debug_checks);
-
- bch_time_stats_update(&c->mca_alloc_time, start_time);
-
- return b;
-err:
- /* Try to cannibalize another cached btree node: */
- if (c->btree_cache_alloc_lock == current) {
- b = mca_cannibalize(c);
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache_lock);
-
- mca_hash_remove(c, b);
-
- trace_bcache_mca_cannibalize(c);
- goto out;
- }
-
- mutex_unlock(&c->btree_cache_lock);
- return ERR_PTR(-ENOMEM);
-}
-
-/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch_btree_node_fill(struct btree_iter *iter,
- const struct bkey_i *k,
- unsigned level,
- enum six_lock_type lock_type)
-{
- struct bch_fs *c = iter->c;
- struct btree *b;
-
- b = mca_alloc(c);
- if (IS_ERR(b))
- return b;
-
- bkey_copy(&b->key, k);
- if (mca_hash_insert(c, b, level, iter->btree_id)) {
- /* raced with another fill: */
-
- /* mark as unhashed... */
- bkey_i_to_extent(&b->key)->v._data[0] = 0;
-
- mutex_lock(&c->btree_cache_lock);
- list_add(&b->list, &c->btree_cache_freeable);
- mutex_unlock(&c->btree_cache_lock);
-
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
- return NULL;
- }
-
- /*
- * If the btree node wasn't cached, we can't drop our lock on
- * the parent until after it's added to the cache - because
- * otherwise we could race with a btree_split() freeing the node
- * we're trying to lock.
- *
- * But the deadlock described below doesn't exist in this case,
- * so it's safe to not drop the parent lock until here:
- */
- if (btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
-
- bch_btree_node_read(c, b);
- six_unlock_write(&b->lock);
-
- if (lock_type == SIX_LOCK_read)
- six_lock_downgrade(&b->lock);
-
- return b;
-}
-
-/**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * If IO is necessary and running under generic_make_request, returns -EAGAIN.
- *
- * The btree node will have either a read or an intent lock held, depending on
- * the @lock_type parameter.
- */
-struct btree *bch_btree_node_get(struct btree_iter *iter,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type)
-{
- struct btree *b;
- struct bset_tree *t;
-
- BUG_ON(level >= BTREE_MAX_DEPTH);
-retry:
- rcu_read_lock();
- b = mca_find(iter->c, k);
- rcu_read_unlock();
-
- if (unlikely(!b)) {
- /*
- * We must have the parent locked to call bch_btree_node_fill(),
- * else we could read in a btree node from disk that's been
- * freed:
- */
- b = bch_btree_node_fill(iter, k, level, lock_type);
-
- /* We raced and found the btree node in the cache */
- if (!b)
- goto retry;
-
- if (IS_ERR(b))
- return b;
- } else {
- /*
- * There's a potential deadlock with splits and insertions into
- * interior nodes we have to avoid:
- *
- * The other thread might be holding an intent lock on the node
- * we want, and they want to update its parent node so they're
- * going to upgrade their intent lock on the parent node to a
- * write lock.
- *
- * But if we're holding a read lock on the parent, and we're
- * trying to get the intent lock they're holding, we deadlock.
- *
- * So to avoid this we drop the read locks on parent nodes when
- * we're starting to take intent locks - and handle the race.
- *
- * The race is that they might be about to free the node we
- * want, and dropping our read lock on the parent node lets them
- * update the parent marking the node we want as freed, and then
- * free it:
- *
- * To guard against this, btree nodes are evicted from the cache
- * when they're freed - and PTR_HASH() is zeroed out, which we
- * check for after we lock the node.
- *
- * Then, btree_node_relock() on the parent will fail - because
- * the parent was modified, when the pointer to the node we want
- * was removed - and we'll bail out:
- */
- if (btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
-
- if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
- return ERR_PTR(-EINTR);
-
- if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
- b->level != level ||
- race_fault())) {
- six_unlock_type(&b->lock, lock_type);
- if (btree_node_relock(iter, level + 1))
- goto retry;
-
- return ERR_PTR(-EINTR);
- }
- }
-
- prefetch(b->aux_data);
-
- for_each_bset(b, t) {
- void *p = (u64 *) b->aux_data + t->aux_data_offset;
-
- prefetch(p + L1_CACHE_BYTES * 0);
- prefetch(p + L1_CACHE_BYTES * 1);
- prefetch(p + L1_CACHE_BYTES * 2);
- }
-
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
-
- if (unlikely(btree_node_read_error(b))) {
- six_unlock_type(&b->lock, lock_type);
- return ERR_PTR(-EIO);
- }
-
- EBUG_ON(!b->written);
- EBUG_ON(b->btree_id != iter->btree_id ||
- BTREE_NODE_LEVEL(b->data) != level ||
- bkey_cmp(b->data->max_key, k->k.p));
-
- return b;
-}
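
Sketch of the caller-side convention (a hypothetical wrapper, shown only to illustrate the -EINTR contract): an -EINTR return means locks were dropped, so the iterator must be re-traversed before retrying.

static struct btree *get_locked_child(struct btree_iter *iter,
                                      const struct bkey_i *k, unsigned level)
{
        struct btree *b = bch_btree_node_get(iter, k, level, SIX_LOCK_read);

        if (IS_ERR(b) && PTR_ERR(b) == -EINTR) {
                /* a lock was dropped: re-traverse the iterator, then retry */
        }

        return b;
}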
-
-int bch_print_btree_node(struct bch_fs *c, struct btree *b,
- char *buf, size_t len)
-{
- const struct bkey_format *f = &b->format;
- struct bset_stats stats;
- char ptrs[100];
-
- memset(&stats, 0, sizeof(stats));
-
- bch_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
- bkey_i_to_s_c(&b->key));
- bch_btree_keys_stats(b, &stats);
-
- return scnprintf(buf, len,
- "l %u %llu:%llu - %llu:%llu:\n"
- " ptrs: %s\n"
- " format: u64s %u fields %u %u %u %u %u\n"
- " unpack fn len: %u\n"
- " bytes used %zu/%zu (%zu%% full)\n"
- " sib u64s: %u, %u (merge threshold %zu)\n"
- " nr packed keys %u\n"
- " nr unpacked keys %u\n"
- " floats %zu\n"
- " failed unpacked %zu\n"
- " failed prev %zu\n"
- " failed overflow %zu\n",
- b->level,
- b->data->min_key.inode,
- b->data->min_key.offset,
- b->data->max_key.inode,
- b->data->max_key.offset,
- ptrs,
- f->key_u64s,
- f->bits_per_field[0],
- f->bits_per_field[1],
- f->bits_per_field[2],
- f->bits_per_field[3],
- f->bits_per_field[4],
- b->unpack_fn_len,
- b->nr.live_u64s * sizeof(u64),
- btree_bytes(c) - sizeof(struct btree_node),
- b->nr.live_u64s * 100 / btree_max_u64s(c),
- b->sib_u64s[0],
- b->sib_u64s[1],
- BTREE_FOREGROUND_MERGE_THRESHOLD(c),
- b->nr.packed_keys,
- b->nr.unpacked_keys,
- stats.floats,
- stats.failed_unpacked,
- stats.failed_prev,
- stats.failed_overflow);
-}
diff --git a/libbcache/btree_cache.h b/libbcache/btree_cache.h
deleted file mode 100644
index 0d1c00c4..00000000
--- a/libbcache/btree_cache.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef _BCACHE_BTREE_CACHE_H
-#define _BCACHE_BTREE_CACHE_H
-
-#include "bcache.h"
-#include "btree_types.h"
-
-struct btree_iter;
-
-extern const char * const bch_btree_ids[];
-
-void bch_recalc_btree_reserve(struct bch_fs *);
-
-void mca_hash_remove(struct bch_fs *, struct btree *);
-int mca_hash_insert(struct bch_fs *, struct btree *,
- unsigned, enum btree_id);
-
-void mca_cannibalize_unlock(struct bch_fs *);
-int mca_cannibalize_lock(struct bch_fs *, struct closure *);
-
-struct btree *mca_alloc(struct bch_fs *);
-
-struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
- unsigned, enum six_lock_type);
-
-void bch_fs_btree_exit(struct bch_fs *);
-int bch_fs_btree_init(struct bch_fs *);
-
-#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
- for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl, \
- &(_c)->btree_cache_table), \
- _iter = 0; _iter < (_tbl)->size; _iter++) \
- rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-
-static inline size_t btree_bytes(struct bch_fs *c)
-{
- return c->sb.btree_node_size << 9;
-}
-
-static inline size_t btree_max_u64s(struct bch_fs *c)
-{
- return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
-}
-
-static inline size_t btree_pages(struct bch_fs *c)
-{
- return c->sb.btree_node_size >> (PAGE_SHIFT - 9);
-}
-
-static inline size_t btree_page_order(struct bch_fs *c)
-{
- return ilog2(btree_pages(c));
-}
-
-static inline unsigned btree_blocks(struct bch_fs *c)
-{
- return c->sb.btree_node_size >> c->block_bits;
-}
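
Worked example of the size helpers above as a standalone program (assuming 4 KiB pages and a hypothetical 256 KiB btree node, i.e. btree_node_size == 512 sectors; __builtin_ctzl is used as a GCC/Clang stand-in for ilog2 on a power of two):

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long btree_node_size = 512;    /* in 512-byte sectors (assumed) */
        unsigned page_shift = 12;               /* 4 KiB pages (assumed) */

        unsigned long bytes = btree_node_size << 9;                 /* 262144 */
        unsigned long pages = btree_node_size >> (page_shift - 9);  /* 64 */
        unsigned order = __builtin_ctzl(pages);                     /* ilog2(64) == 6 */

        printf("bytes=%lu pages=%lu order=%u\n", bytes, pages, order);
        assert(bytes == 262144 && pages == 64 && order == 6);
        return 0;
}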
-
-#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
-
-#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
-#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
-
-#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
-
-int bch_print_btree_node(struct bch_fs *, struct btree *,
- char *, size_t);
-
-#endif /* _BCACHE_BTREE_CACHE_H */
diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c
deleted file mode 100644
index 5270d442..00000000
--- a/libbcache/btree_gc.c
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright (C) 2014 Datera Inc.
- */
-
-#include "bcache.h"
-#include "alloc.h"
-#include "bkey_methods.h"
-#include "btree_locking.h"
-#include "btree_update.h"
-#include "btree_io.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "clock.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "super-io.h"
-#include "writeback.h"
-
-#include <linux/slab.h>
-#include <linux/bitops.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/rcupdate.h>
-#include <trace/events/bcache.h>
-
-struct range_checks {
- struct range_level {
- struct bpos min;
- struct bpos max;
- } l[BTREE_MAX_DEPTH];
- unsigned depth;
-};
-
-static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
-{
- unsigned i;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- r->l[i].min = r->l[i].max = POS_MIN;
- r->depth = depth;
-}
-
-static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
- struct range_checks *r)
-{
- struct range_level *l = &r->l[b->level];
-
- struct bpos expected_min = bkey_cmp(l->min, l->max)
- ? btree_type_successor(b->btree_id, l->max)
- : l->max;
-
- bch_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
- "btree node has incorrect min key: %llu:%llu != %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset,
- expected_min.inode,
- expected_min.offset);
-
- l->max = b->data->max_key;
-
- if (b->level > r->depth) {
- l = &r->l[b->level - 1];
-
- bch_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
- "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset,
- l->min.inode,
- l->min.offset);
-
- bch_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
- "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
- b->data->max_key.inode,
- b->data->max_key.offset,
- l->max.inode,
- l->max.offset);
-
- if (bkey_cmp(b->data->max_key, POS_MAX))
- l->min = l->max =
- btree_type_successor(b->btree_id,
- b->data->max_key);
- }
-}
-
-u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
-{
- const struct bch_extent_ptr *ptr;
- u8 max_stale = 0;
-
- if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
- size_t b = PTR_BUCKET_NR(ca, ptr);
-
- if (__gen_after(ca->oldest_gens[b], ptr->gen))
- ca->oldest_gens[b] = ptr->gen;
-
- max_stale = max(max_stale, ptr_stale(ca, ptr));
- }
- }
-
- return max_stale;
-}
-
-/*
- * For runtime mark and sweep:
- */
-static u8 bch_btree_mark_key(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
-{
- switch (type) {
- case BKEY_TYPE_BTREE:
- bch_gc_mark_key(c, k, c->sb.btree_node_size, true);
- return 0;
- case BKEY_TYPE_EXTENTS:
- bch_gc_mark_key(c, k, k.k->size, false);
- return bch_btree_key_recalc_oldest_gen(c, k);
- default:
- BUG();
- }
-}
-
-u8 bch_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
-{
- atomic64_set(&c->key_version,
- max_t(u64, k.k->version.lo,
- atomic64_read(&c->key_version)));
-
- return bch_btree_mark_key(c, type, k);
-}
-
-static bool btree_gc_mark_node(struct bch_fs *c, struct btree *b)
-{
- if (btree_node_has_ptrs(b)) {
- struct btree_node_iter iter;
- struct bkey unpacked;
- struct bkey_s_c k;
- u8 stale = 0;
-
- for_each_btree_node_key_unpack(b, k, &iter,
- btree_node_is_extents(b),
- &unpacked) {
- bkey_debugcheck(c, b, k);
- stale = max(stale, bch_btree_mark_key(c,
- btree_node_type(b), k));
- }
-
- if (btree_gc_rewrite_disabled(c))
- return false;
-
- if (stale > 10)
- return true;
- }
-
- if (btree_gc_always_rewrite(c))
- return true;
-
- return false;
-}
-
-static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- write_seqcount_begin(&c->gc_pos_lock);
- c->gc_pos = new_pos;
- write_seqcount_end(&c->gc_pos_lock);
-}
-
-static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
-{
- BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
- __gc_pos_set(c, new_pos);
-}
-
-static int bch_gc_btree(struct bch_fs *c, enum btree_id btree_id)
-{
- struct btree_iter iter;
- struct btree *b;
- bool should_rewrite;
- struct range_checks r;
- unsigned depth = btree_id == BTREE_ID_EXTENTS ? 0 : 1;
- int ret;
-
- /*
- * if expensive_debug_checks is on, run range_checks on all leaf nodes:
- */
- if (expensive_debug_checks(c))
- depth = 0;
-
- btree_node_range_checks_init(&r, depth);
-
- for_each_btree_node(&iter, c, btree_id, POS_MIN, depth, b) {
- btree_node_range_checks(c, b, &r);
-
- bch_verify_btree_nr_keys(b);
-
- should_rewrite = btree_gc_mark_node(c, b);
-
- gc_pos_set(c, gc_pos_btree_node(b));
-
- if (should_rewrite)
- bch_btree_node_rewrite(&iter, b, NULL);
-
- bch_btree_iter_cond_resched(&iter);
- }
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- return ret;
-
- mutex_lock(&c->btree_root_lock);
-
- b = c->btree_roots[btree_id].b;
- bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
- gc_pos_set(c, gc_pos_btree_root(b->btree_id));
-
- mutex_unlock(&c->btree_root_lock);
- return 0;
-}
-
-static void bch_mark_allocator_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct open_bucket *ob;
- size_t i, j, iter;
- unsigned ci;
-
- for_each_member_device(ca, c, ci) {
- spin_lock(&ca->freelist_lock);
-
- fifo_for_each_entry(i, &ca->free_inc, iter)
- bch_mark_alloc_bucket(ca, &ca->buckets[i], true);
-
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- bch_mark_alloc_bucket(ca, &ca->buckets[i], true);
-
- spin_unlock(&ca->freelist_lock);
- }
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- const struct bch_extent_ptr *ptr;
-
- mutex_lock(&ob->lock);
- open_bucket_for_each_ptr(ob, ptr) {
- ca = c->devs[ptr->dev];
- bch_mark_alloc_bucket(ca, PTR_BUCKET(ca, ptr), true);
- }
- mutex_unlock(&ob->lock);
- }
-}
-
-static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end,
- enum bucket_data_type type)
-{
- u64 b = start >> ca->bucket_bits;
-
- do {
- bch_mark_metadata_bucket(ca, ca->buckets + b, type, true);
- b++;
- } while (b < end >> ca->bucket_bits);
-}
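
A standalone illustration of the sector-to-bucket arithmetic in the loop above (assuming a hypothetical bucket_bits == 7, i.e. 128-sector / 64 KiB buckets, and a bucket-aligned end):

#include <assert.h>

/* counts how many buckets the do/while above would mark for [start, end) */
static unsigned toy_buckets_marked(unsigned long start, unsigned long end,
                                   unsigned bucket_bits)
{
        unsigned long b = start >> bucket_bits;
        unsigned n = 0;

        do {
                n++;
                b++;
        } while (b < (end >> bucket_bits));

        return n;
}

int main(void)
{
        /* sectors [0, 256) with 128-sector buckets touch buckets 0 and 1 */
        assert(toy_buckets_marked(0, 256, 7) == 2);
        /* a sub-bucket range still marks the bucket containing it */
        assert(toy_buckets_marked(0, 8, 7) == 1);
        return 0;
}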
-
-static void bch_dev_mark_superblocks(struct bch_dev *ca)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- unsigned i;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- if (layout->sb_offset[i] == BCH_SB_SECTOR)
- mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
- BUCKET_SB);
-
- mark_metadata_sectors(ca,
- layout->sb_offset[i],
- layout->sb_offset[i] +
- (1 << layout->sb_max_size_bits),
- BUCKET_SB);
- }
-}
-
-/*
- * Mark non btree metadata - prios, journal
- */
-void bch_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned i;
- u64 b;
-
- lockdep_assert_held(&c->sb_lock);
-
- bch_dev_mark_superblocks(ca);
-
- spin_lock(&c->journal.lock);
-
- for (i = 0; i < ca->journal.nr; i++) {
- b = ca->journal.buckets[i];
- bch_mark_metadata_bucket(ca, ca->buckets + b,
- BUCKET_JOURNAL, true);
- }
-
- spin_unlock(&c->journal.lock);
-
- spin_lock(&ca->prio_buckets_lock);
-
- for (i = 0; i < prio_buckets(ca) * 2; i++) {
- b = ca->prio_buckets[i];
- if (b)
- bch_mark_metadata_bucket(ca, ca->buckets + b,
- BUCKET_PRIOS, true);
- }
-
- spin_unlock(&ca->prio_buckets_lock);
-}
-
-static void bch_mark_metadata(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
-
- mutex_lock(&c->sb_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA));
-
- for_each_online_member(ca, c, i)
- bch_mark_dev_metadata(c, ca);
- mutex_unlock(&c->sb_lock);
-}
-
-/* Also see bch_pending_btree_node_free_insert_done() */
-static void bch_mark_pending_btree_node_frees(struct bch_fs *c)
-{
- struct bch_fs_usage stats = { 0 };
- struct btree_interior_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
- gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
-
- for_each_pending_btree_node_free(c, as, d)
- if (d->index_update_done)
- __bch_gc_mark_key(c, bkey_i_to_s_c(&d->key),
- c->sb.btree_node_size, true,
- &stats);
- /*
- * Don't apply stats - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-/**
- * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
- */
-void bch_gc(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct bucket *g;
- struct bucket_mark new;
- u64 start_time = local_clock();
- unsigned i;
- int cpu;
-
- /*
- * Walk _all_ references to buckets, and recompute them:
- *
- * Order matters here:
- * - Concurrent GC relies on the fact that we have a total ordering for
- * everything that GC walks - see gc_will_visit_node(),
- * gc_will_visit_root()
- *
- * - also, references move around in the course of index updates and
- * various other crap: everything needs to agree on the ordering
- * references are allowed to move around in - e.g., we're allowed to
- * start with a reference owned by an open_bucket (the allocator) and
- * move it to the btree, but not the reverse.
- *
- * This is necessary to ensure that gc doesn't miss references that
- * move around - if references move backwards in the ordering GC
- * uses, GC could skip past them
- */
-
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return;
-
- trace_bcache_gc_start(c);
-
- /*
- * Do this before taking gc_lock - bch_disk_reservation_get() blocks on
- * gc_lock if sectors_available goes to 0:
- */
- bch_recalc_sectors_available(c);
-
- down_write(&c->gc_lock);
-
- lg_global_lock(&c->usage_lock);
-
- /*
- * Indicates to buckets code that gc is now in progress - done under
- * usage_lock to avoid racing with bch_mark_key():
- */
- __gc_pos_set(c, GC_POS_MIN);
-
- /* Save a copy of the existing bucket stats while we recompute them: */
- for_each_member_device(ca, c, i) {
- ca->usage_cached = __bch_dev_usage_read(ca);
- for_each_possible_cpu(cpu) {
- struct bch_dev_usage *p =
- per_cpu_ptr(ca->usage_percpu, cpu);
- memset(p, 0, sizeof(*p));
- }
- }
-
- c->usage_cached = __bch_fs_usage_read(c);
- for_each_possible_cpu(cpu) {
- struct bch_fs_usage *p =
- per_cpu_ptr(c->usage_percpu, cpu);
-
- memset(p->s, 0, sizeof(p->s));
- p->persistent_reserved = 0;
- }
-
- lg_global_unlock(&c->usage_lock);
-
- /* Clear bucket marks: */
- for_each_member_device(ca, c, i)
- for_each_bucket(g, ca) {
- bucket_cmpxchg(g, new, ({
- new.owned_by_allocator = 0;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- }));
- ca->oldest_gens[g - ca->buckets] = new.gen;
- }
-
- /* Walk allocator's references: */
- bch_mark_allocator_buckets(c);
-
- /* Walk btree: */
- while (c->gc_pos.phase < (int) BTREE_ID_NR) {
- int ret = c->btree_roots[c->gc_pos.phase].b
- ? bch_gc_btree(c, (int) c->gc_pos.phase)
- : 0;
-
- if (ret) {
- bch_err(c, "btree gc failed: %d", ret);
- set_bit(BCH_FS_GC_FAILURE, &c->flags);
- up_write(&c->gc_lock);
- return;
- }
-
- gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
- }
-
- bch_mark_metadata(c);
- bch_mark_pending_btree_node_frees(c);
- bch_writeback_recalc_oldest_gens(c);
-
- for_each_member_device(ca, c, i)
- atomic_long_set(&ca->saturated_count, 0);
-
- /* Indicates that gc is no longer in progress: */
- gc_pos_set(c, gc_phase(GC_PHASE_DONE));
-
- up_write(&c->gc_lock);
- trace_bcache_gc_end(c);
- bch_time_stats_update(&c->btree_gc_time, start_time);
-
- /*
- * Wake up allocator in case it was waiting for buckets
- * because of not being able to inc gens
- */
- for_each_member_device(ca, c, i)
- bch_wake_allocator(ca);
-}
-
-/* Btree coalescing */
-
-static void recalc_packed_keys(struct btree *b)
-{
- struct bkey_packed *k;
-
- memset(&b->nr, 0, sizeof(b->nr));
-
- BUG_ON(b->nsets != 1);
-
- for (k = btree_bkey_first(b, b->set);
- k != btree_bkey_last(b, b->set);
- k = bkey_next(k))
- btree_keys_account_key_add(&b->nr, 0, k);
-}
-
-static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
- struct btree_iter *iter)
-{
- struct btree *parent = iter->nodes[old_nodes[0]->level + 1];
- struct bch_fs *c = iter->c;
- unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
- unsigned blocks = btree_blocks(c) * 2 / 3;
- struct btree *new_nodes[GC_MERGE_NODES];
- struct btree_interior_update *as;
- struct btree_reserve *res;
- struct keylist keylist;
- struct bkey_format_state format_state;
- struct bkey_format new_format;
-
- memset(new_nodes, 0, sizeof(new_nodes));
- bch_keylist_init(&keylist, NULL, 0);
-
- /* Count keys that are not deleted */
- for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++)
- u64s += old_nodes[i]->nr.live_u64s;
-
- nr_old_nodes = nr_new_nodes = i;
-
- /* Check if all keys in @old_nodes could fit in one fewer node */
- if (nr_old_nodes <= 1 ||
- __vstruct_blocks(struct btree_node, c->block_bits,
- DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
- return;
-
- res = bch_btree_reserve_get(c, parent, nr_old_nodes,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- NULL);
- if (IS_ERR(res)) {
- trace_bcache_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_RESERVE_GET);
- return;
- }
-
- if (bch_keylist_realloc(&keylist, NULL, 0,
- (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
- trace_bcache_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
- goto out;
- }
-
- /* Find a format that all keys in @old_nodes can pack into */
- bch_bkey_format_init(&format_state);
-
- for (i = 0; i < nr_old_nodes; i++)
- __bch_btree_calc_format(&format_state, old_nodes[i]);
-
- new_format = bch_bkey_format_done(&format_state);
-
- /* Check if repacking would make any nodes too big to fit */
- for (i = 0; i < nr_old_nodes; i++)
- if (!bch_btree_node_format_fits(c, old_nodes[i], &new_format)) {
- trace_bcache_btree_gc_coalesce_fail(c,
- BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
- goto out;
- }
-
- trace_bcache_btree_gc_coalesce(c, parent, nr_old_nodes);
-
- as = bch_btree_interior_update_alloc(c);
-
- for (i = 0; i < nr_old_nodes; i++)
- bch_btree_interior_update_will_free_node(c, as, old_nodes[i]);
-
- /* Repack everything with @new_format and sort down to one bset */
- for (i = 0; i < nr_old_nodes; i++)
- new_nodes[i] = __btree_node_alloc_replacement(c, old_nodes[i],
- new_format, res);
-
- /*
- * Conceptually we concatenate the nodes together and slice them
- * up at different boundaries.
- */
- for (i = nr_new_nodes - 1; i > 0; --i) {
- struct btree *n1 = new_nodes[i];
- struct btree *n2 = new_nodes[i - 1];
-
- struct bset *s1 = btree_bset_first(n1);
- struct bset *s2 = btree_bset_first(n2);
- struct bkey_packed *k, *last = NULL;
-
- /* Calculate how many keys from @n2 we could fit inside @n1 */
- u64s = 0;
-
- for (k = s2->start;
- k < vstruct_last(s2) &&
- vstruct_blocks_plus(n1->data, c->block_bits,
- u64s + k->u64s) <= blocks;
- k = bkey_next(k)) {
- last = k;
- u64s += k->u64s;
- }
-
- if (u64s == le16_to_cpu(s2->u64s)) {
- /* n2 fits entirely in n1 */
- n1->key.k.p = n1->data->max_key = n2->data->max_key;
-
- memcpy_u64s(vstruct_last(s1),
- s2->start,
- le16_to_cpu(s2->u64s));
- le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
-
- set_btree_bset_end(n1, n1->set);
-
- six_unlock_write(&n2->lock);
- bch_btree_node_free_never_inserted(c, n2);
- six_unlock_intent(&n2->lock);
-
- memmove(new_nodes + i - 1,
- new_nodes + i,
- sizeof(new_nodes[0]) * (nr_new_nodes - i));
- new_nodes[--nr_new_nodes] = NULL;
- } else if (u64s) {
- /* move part of n2 into n1 */
- n1->key.k.p = n1->data->max_key =
- bkey_unpack_pos(n1, last);
-
- n2->data->min_key =
- btree_type_successor(iter->btree_id,
- n1->data->max_key);
-
- memcpy_u64s(vstruct_last(s1),
- s2->start, u64s);
- le16_add_cpu(&s1->u64s, u64s);
-
- memmove(s2->start,
- vstruct_idx(s2, u64s),
- (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
- s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
-
- set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
- }
- }
-
- for (i = 0; i < nr_new_nodes; i++) {
- struct btree *n = new_nodes[i];
-
- recalc_packed_keys(n);
- btree_node_reset_sib_u64s(n);
-
- bch_btree_build_aux_trees(n);
- six_unlock_write(&n->lock);
-
- bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
- }
-
- /*
- * The keys for the old nodes get deleted. We don't want to insert keys
- * that compare equal to the keys for the new nodes we'll also be
- * inserting - we can't because keys on a keylist must be strictly
- * greater than the previous keys, and we also don't need to since the
- * key for the new node will serve the same purpose (overwriting the key
- * for the old node).
- */
- for (i = 0; i < nr_old_nodes; i++) {
- struct bkey_i delete;
- unsigned j;
-
- for (j = 0; j < nr_new_nodes; j++)
- if (!bkey_cmp(old_nodes[i]->key.k.p,
- new_nodes[j]->key.k.p))
- goto next;
-
- bkey_init(&delete.k);
- delete.k.p = old_nodes[i]->key.k.p;
- bch_keylist_add_in_order(&keylist, &delete);
-next:
- i = i; /* no-op: a label must be followed by a statement */
- }
-
- /*
- * Keys for the new nodes get inserted: bch_btree_insert_keys() only
- * does the lookup once and thus expects the keys to be in sorted order
- * so we have to make sure the new keys are correctly ordered with
- * respect to the deleted keys added in the previous loop
- */
- for (i = 0; i < nr_new_nodes; i++)
- bch_keylist_add_in_order(&keylist, &new_nodes[i]->key);
-
- /* Insert the newly coalesced nodes */
- bch_btree_insert_node(parent, iter, &keylist, res, as);
-
- BUG_ON(!bch_keylist_empty(&keylist));
-
- BUG_ON(iter->nodes[old_nodes[0]->level] != old_nodes[0]);
-
- BUG_ON(!bch_btree_iter_node_replace(iter, new_nodes[0]));
-
- for (i = 0; i < nr_new_nodes; i++)
- btree_open_bucket_put(c, new_nodes[i]);
-
- /* Free the old nodes and update our sliding window */
- for (i = 0; i < nr_old_nodes; i++) {
- bch_btree_node_free_inmem(iter, old_nodes[i]);
- six_unlock_intent(&old_nodes[i]->lock);
-
- /*
- * the index update might have triggered a split, in which case
- * the nodes we coalesced - the new nodes we just created -
- * might not be sibling nodes anymore - don't add them to the
- * sliding window (except the first):
- */
- if (!i) {
- old_nodes[i] = new_nodes[i];
- } else {
- old_nodes[i] = NULL;
- if (new_nodes[i])
- six_unlock_intent(&new_nodes[i]->lock);
- }
- }
-out:
- bch_keylist_free(&keylist, NULL);
- bch_btree_reserve_put(c, res);
-}
-
-static int bch_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
-{
- struct btree_iter iter;
- struct btree *b;
- unsigned i;
-
- /* Sliding window of adjacent btree nodes */
- struct btree *merge[GC_MERGE_NODES];
- u32 lock_seq[GC_MERGE_NODES];
-
- /*
- * XXX: We don't have a good way of positively matching on sibling nodes
- * that have the same parent - this code works by handling the cases
- * where they might not have the same parent, and is thus fragile. Ugh.
- *
- * Perhaps redo this to use multiple linked iterators?
- */
- memset(merge, 0, sizeof(merge));
-
- __for_each_btree_node(&iter, c, btree_id, POS_MIN, 0, b, U8_MAX) {
- memmove(merge + 1, merge,
- sizeof(merge) - sizeof(merge[0]));
- memmove(lock_seq + 1, lock_seq,
- sizeof(lock_seq) - sizeof(lock_seq[0]));
-
- merge[0] = b;
-
- for (i = 1; i < GC_MERGE_NODES; i++) {
- if (!merge[i] ||
- !six_relock_intent(&merge[i]->lock, lock_seq[i]))
- break;
-
- if (merge[i]->level != merge[0]->level) {
- six_unlock_intent(&merge[i]->lock);
- break;
- }
- }
- memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
-
- bch_coalesce_nodes(merge, &iter);
-
- for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
- lock_seq[i] = merge[i]->lock.state.seq;
- six_unlock_intent(&merge[i]->lock);
- }
-
- lock_seq[0] = merge[0]->lock.state.seq;
-
- if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
- bch_btree_iter_unlock(&iter);
- return -ESHUTDOWN;
- }
-
- bch_btree_iter_cond_resched(&iter);
-
- /*
- * If the parent node wasn't relocked, it might have been split
- * and the nodes in our sliding window might not have the same
- * parent anymore - blow away the sliding window:
- */
- if (iter.nodes[iter.level + 1] &&
- !btree_node_intent_locked(&iter, iter.level + 1))
- memset(merge + 1, 0,
- (GC_MERGE_NODES - 1) * sizeof(merge[0]));
- }
- return bch_btree_iter_unlock(&iter);
-}
-
-/**
- * bch_coalesce - coalesce adjacent nodes with low occupancy
- */
-void bch_coalesce(struct bch_fs *c)
-{
- u64 start_time;
- enum btree_id id;
-
- if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
- return;
-
- down_read(&c->gc_lock);
- trace_bcache_gc_coalesce_start(c);
- start_time = local_clock();
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- int ret = c->btree_roots[id].b
- ? bch_coalesce_btree(c, id)
- : 0;
-
- if (ret) {
- if (ret != -ESHUTDOWN)
- bch_err(c, "btree coalescing failed: %d", ret);
- set_bit(BCH_FS_GC_FAILURE, &c->flags);
- up_read(&c->gc_lock);
- return;
- }
- }
-
- bch_time_stats_update(&c->btree_coalesce_time, start_time);
- trace_bcache_gc_coalesce_end(c);
- up_read(&c->gc_lock);
-}
-
-static int bch_gc_thread(void *arg)
-{
- struct bch_fs *c = arg;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic_long_read(&clock->now);
- unsigned last_kick = atomic_read(&c->kick_gc);
-
- set_freezable();
-
- while (1) {
- unsigned long next = last + c->capacity / 16;
-
- while (atomic_long_read(&clock->now) < next) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- return 0;
- }
-
- if (atomic_read(&c->kick_gc) != last_kick) {
- __set_current_state(TASK_RUNNING);
- break;
- }
-
- bch_io_clock_schedule_timeout(clock, next);
- try_to_freeze();
- }
-
- last = atomic_long_read(&clock->now);
- last_kick = atomic_read(&c->kick_gc);
-
- bch_gc(c);
- if (!btree_gc_coalesce_disabled(c))
- bch_coalesce(c);
-
- debug_check_no_locks_held();
- }
-
- return 0;
-}
-
-void bch_gc_thread_stop(struct bch_fs *c)
-{
- set_bit(BCH_FS_GC_STOPPING, &c->flags);
-
- if (c->gc_thread)
- kthread_stop(c->gc_thread);
-
- c->gc_thread = NULL;
- clear_bit(BCH_FS_GC_STOPPING, &c->flags);
-}
-
-int bch_gc_thread_start(struct bch_fs *c)
-{
- struct task_struct *p;
-
- BUG_ON(c->gc_thread);
-
- p = kthread_create(bch_gc_thread, c, "bcache_gc");
- if (IS_ERR(p))
- return PTR_ERR(p);
-
- c->gc_thread = p;
- wake_up_process(c->gc_thread);
- return 0;
-}
-
-/* Initial GC computes bucket marks during startup */
-
-static void bch_initial_gc_btree(struct bch_fs *c, enum btree_id id)
-{
- struct btree_iter iter;
- struct btree *b;
- struct range_checks r;
-
- btree_node_range_checks_init(&r, 0);
-
- if (!c->btree_roots[id].b)
- return;
-
- /*
- * We have to hit every btree node before starting journal replay, in
- * order for the journal seq blacklist machinery to work:
- */
- for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
- btree_node_range_checks(c, b, &r);
-
- if (btree_node_has_ptrs(b)) {
- struct btree_node_iter node_iter;
- struct bkey unpacked;
- struct bkey_s_c k;
-
- for_each_btree_node_key_unpack(b, k, &node_iter,
- btree_node_is_extents(b),
- &unpacked)
- bch_btree_mark_key_initial(c, btree_node_type(b), k);
- }
-
- bch_btree_iter_cond_resched(&iter);
- }
-
- bch_btree_iter_unlock(&iter);
-
- bch_btree_mark_key(c, BKEY_TYPE_BTREE,
- bkey_i_to_s_c(&c->btree_roots[id].b->key));
-}
-
-int bch_initial_gc(struct bch_fs *c, struct list_head *journal)
-{
- enum btree_id id;
-
- for (id = 0; id < BTREE_ID_NR; id++)
- bch_initial_gc_btree(c, id);
-
- if (journal)
- bch_journal_mark(c, journal);
-
- bch_mark_metadata(c);
-
- /*
- * Skip past versions that might have been used (as nonces),
- * but hadn't had their pointers written:
- */
- if (c->sb.encryption_type)
- atomic64_add(1 << 16, &c->key_version);
-
- gc_pos_set(c, gc_phase(GC_PHASE_DONE));
- set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-
- return 0;
-}
diff --git a/libbcache/btree_gc.h b/libbcache/btree_gc.h
deleted file mode 100644
index f1794fdf..00000000
--- a/libbcache/btree_gc.h
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef _BCACHE_GC_H
-#define _BCACHE_GC_H
-
-#include "btree_types.h"
-
-enum bkey_type;
-
-void bch_coalesce(struct bch_fs *);
-void bch_gc(struct bch_fs *);
-void bch_gc_thread_stop(struct bch_fs *);
-int bch_gc_thread_start(struct bch_fs *);
-int bch_initial_gc(struct bch_fs *, struct list_head *);
-u8 bch_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
-u8 bch_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
- struct bkey_s_c);
-void bch_mark_dev_metadata(struct bch_fs *, struct bch_dev *);
-
-/*
- * For concurrent mark and sweep (with other index updates), we define a total
- * ordering of _all_ references GC walks:
- *
- * Note that some references will have the same GC position as others - e.g.
- * everything within the same btree node; in those cases we're relying on
- * whatever locking exists for where those references live, i.e. the write lock
- * on a btree node.
- *
- * That locking is also required to ensure GC doesn't pass the updater in
- * between the updater adding/removing the reference and updating the GC marks;
- * without that, we would at best double count sometimes.
- *
- * That part is important - whenever calling bch_mark_pointers(), a lock _must_
- * be held that prevents GC from passing the position the updater is at.
- *
- * (What about the start of gc, when we're clearing all the marks? GC clears the
- * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
- * position inside its cmpxchg loop, so crap magically works).
- */
-
-/* Position of (the start of) a gc phase: */
-static inline struct gc_pos gc_phase(enum gc_phase phase)
-{
- return (struct gc_pos) {
- .phase = phase,
- .pos = POS_MIN,
- .level = 0,
- };
-}
-
-#define GC_POS_MIN gc_phase(0)
-
-static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
-{
- if (l.phase != r.phase)
- return l.phase < r.phase ? -1 : 1;
- if (bkey_cmp(l.pos, r.pos))
- return bkey_cmp(l.pos, r.pos);
- if (l.level != r.level)
- return l.level < r.level ? -1 : 1;
- return 0;
-}
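
A toy check of the lexicographic (phase, pos, level) ordering that the comment above relies on (plain ints instead of struct bpos, assumed for illustration):

#include <assert.h>

struct toy_gc_pos { int phase; int pos; int level; };

static int toy_gc_pos_cmp(struct toy_gc_pos l, struct toy_gc_pos r)
{
        if (l.phase != r.phase)
                return l.phase < r.phase ? -1 : 1;
        if (l.pos != r.pos)
                return l.pos < r.pos ? -1 : 1;
        if (l.level != r.level)
                return l.level < r.level ? -1 : 1;
        return 0;
}

int main(void)
{
        struct toy_gc_pos a = { 0, 5, 3 }, b = { 1, 0, 0 };

        assert(toy_gc_pos_cmp(a, b) < 0);       /* phase dominates pos and level */
        assert(toy_gc_pos_cmp(a, a) == 0);      /* equal positions compare equal */
        return 0;
}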
-
-/*
- * GC position of the pointers within a btree node: note, _not_ for &b->key
- * itself, that lives in the parent node:
- */
-static inline struct gc_pos gc_pos_btree_node(struct btree *b)
-{
- return (struct gc_pos) {
- .phase = b->btree_id,
- .pos = b->key.k.p,
- .level = b->level,
- };
-}
-
-/*
- * GC position of the pointer to a btree root: we don't use
- * gc_pos_pointer_to_btree_node() here to avoid a potential race with
- * btree_split() increasing the tree depth - the new root will have level > the
- * old root and thus have a greater gc position than the old root, but that
- * would be incorrect since once gc has marked the root it's not coming back.
- */
-static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
-{
- return (struct gc_pos) {
- .phase = (int) id,
- .pos = POS_MAX,
- .level = U8_MAX,
- };
-}
-
-static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
-{
- unsigned seq;
- bool ret;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- ret = gc_pos_cmp(c->gc_pos, pos) < 0;
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
- return ret;
-}
-
-#endif
diff --git a/libbcache/btree_io.c b/libbcache/btree_io.c
deleted file mode 100644
index 737e54ec..00000000
--- a/libbcache/btree_io.c
+++ /dev/null
@@ -1,1738 +0,0 @@
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_update.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "super-io.h"
-
-#include <trace/events/bcache.h>
-
-static void verify_no_dups(struct btree *b,
- struct bkey_packed *start,
- struct bkey_packed *end)
-{
-#ifdef CONFIG_BCACHE_DEBUG
- struct bkey_packed *k;
-
- for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
- struct bkey l = bkey_unpack_key(b, k);
- struct bkey r = bkey_unpack_key(b, bkey_next(k));
-
- BUG_ON(btree_node_is_extents(b)
- ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
- : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
- //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
- }
-#endif
-}
-
-static void clear_needs_whiteout(struct bset *i)
-{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
- k->needs_whiteout = false;
-}
-
-static void set_needs_whiteout(struct bset *i)
-{
- struct bkey_packed *k;
-
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
- k->needs_whiteout = true;
-}
-
-static void btree_bounce_free(struct bch_fs *c, unsigned order,
- bool used_mempool, void *p)
-{
- if (used_mempool)
- mempool_free(virt_to_page(p), &c->btree_bounce_pool);
- else
- free_pages((unsigned long) p, order);
-}
-
-static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
- bool *used_mempool)
-{
- void *p;
-
- BUG_ON(1 << order > btree_pages(c));
-
- *used_mempool = false;
- p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
- if (p)
- return p;
-
- *used_mempool = true;
- return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
-}
-
-typedef int (*sort_cmp_fn)(struct btree *,
- struct bkey_packed *,
- struct bkey_packed *);
-
-struct sort_iter {
- struct btree *b;
- unsigned used;
-
- struct sort_iter_set {
- struct bkey_packed *k, *end;
- } data[MAX_BSETS + 1];
-};
-
-static void sort_iter_init(struct sort_iter *iter, struct btree *b)
-{
- memset(iter, 0, sizeof(*iter));
- iter->b = b;
-}
-
-static inline void __sort_iter_sift(struct sort_iter *iter,
- unsigned from,
- sort_cmp_fn cmp)
-{
- unsigned i;
-
- for (i = from;
- i + 1 < iter->used &&
- cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
- i++)
- swap(iter->data[i], iter->data[i + 1]);
-}
-
-static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-
- __sort_iter_sift(iter, 0, cmp);
-}
-
-static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- unsigned i = iter->used;
-
- while (i--)
- __sort_iter_sift(iter, i, cmp);
-}
-
-static void sort_iter_add(struct sort_iter *iter,
- struct bkey_packed *k,
- struct bkey_packed *end)
-{
- BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
-
- if (k != end)
- iter->data[iter->used++] = (struct sort_iter_set) { k, end };
-}
-
-static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
-{
- return iter->used ? iter->data->k : NULL;
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- iter->data->k = bkey_next(iter->data->k);
-
- BUG_ON(iter->data->k > iter->data->end);
-
- if (iter->data->k == iter->data->end)
- memmove(&iter->data[0],
- &iter->data[1],
- sizeof(iter->data[0]) * --iter->used);
- else
- sort_iter_sift(iter, cmp);
-}
-
-static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
- sort_cmp_fn cmp)
-{
- struct bkey_packed *ret = sort_iter_peek(iter);
-
- if (ret)
- sort_iter_advance(iter, cmp);
-
- return ret;
-}
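
A sketch of how the iterator above gets populated (hypothetical helper; sort_key_whiteouts() below shows the matching drain loop):

static void toy_setup_merge(struct sort_iter *iter, struct btree *b)
{
        struct bset_tree *t;

        sort_iter_init(iter, b);

        for_each_bset(b, t)
                sort_iter_add(iter,
                              btree_bkey_first(b, t),
                              btree_bkey_last(b, t));

        /* ...then drain in merged key order with sort_iter_next() */
}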
-
-static inline int sort_key_whiteouts_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- return bkey_cmp_packed(b, l, r);
-}
-
-static unsigned sort_key_whiteouts(struct bkey_packed *dst,
- struct sort_iter *iter)
-{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, sort_key_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
- bkey_copy(out, in);
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extent_whiteouts_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- struct bkey ul = bkey_unpack_key(b, l);
- struct bkey ur = bkey_unpack_key(b, r);
-
- return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
-}
-
-static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
- struct sort_iter *iter)
-{
- const struct bkey_format *f = &iter->b->format;
- struct bkey_packed *in, *out = dst;
- struct bkey_i l, r;
- bool prev = false, l_packed = false;
- u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE);
- u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET);
- u64 new_size;
-
- max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
-
- sort_iter_sort(iter, sort_extent_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
- EBUG_ON(bkeyp_val_u64s(f, in));
- EBUG_ON(in->type != KEY_TYPE_DISCARD);
-
- r.k = bkey_unpack_key(iter->b, in);
-
- if (prev &&
- bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
- if (bkey_cmp(l.k.p, r.k.p) >= 0)
- continue;
-
- new_size = l_packed
- ? min(max_packed_size, max_packed_offset -
- bkey_start_offset(&l.k))
- : KEY_SIZE_MAX;
-
- new_size = min(new_size, r.k.p.offset -
- bkey_start_offset(&l.k));
-
- BUG_ON(new_size < l.k.size);
-
- bch_key_resize(&l.k, new_size);
-
- if (bkey_cmp(l.k.p, r.k.p) >= 0)
- continue;
-
- bch_cut_front(l.k.p, &r);
- }
-
- if (prev) {
- if (!bkey_pack(out, &l, f)) {
- BUG_ON(l_packed);
- bkey_copy(out, &l);
- }
- out = bkey_next(out);
- }
-
- l = r;
- prev = true;
- l_packed = bkey_packed(in);
- }
-
- if (prev) {
- if (!bkey_pack(out, &l, f)) {
- BUG_ON(l_packed);
- bkey_copy(out, &l);
- }
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
- bool compacting,
- enum compact_mode mode)
-{
- unsigned live_u64s = b->nr.bset_u64s[t - b->set];
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
-
- if (live_u64s == bset_u64s)
- return 0;
-
- if (mode == COMPACT_LAZY) {
- if (live_u64s * 4 < bset_u64s * 3 ||
- (compacting && bset_unwritten(b, bset(b, t))))
- return bset_u64s - live_u64s;
- } else {
- if (bset_written(b, bset(b, t)))
- return bset_u64s - live_u64s;
- }
-
- return 0;
-}
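For reference, the COMPACT_LAZY test above fires once more than a quarter of the bset's u64s are dead; the same threshold written out on its own (helper name hypothetical):

/* Lazy compaction threshold: live keys make up less than 3/4 of the bset. */
static inline bool bset_worth_lazy_compacting(unsigned live_u64s, unsigned bset_u64s)
{
	return live_u64s * 4 < bset_u64s * 3;
}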
-
-bool __bch_compact_whiteouts(struct bch_fs *c, struct btree *b,
- enum compact_mode mode)
-{
- const struct bkey_format *f = &b->format;
- struct bset_tree *t;
- struct bkey_packed *whiteouts = NULL;
- struct bkey_packed *u_start, *u_pos;
- struct sort_iter sort_iter;
- unsigned order, whiteout_u64s = 0, u64s;
- bool used_mempool, compacting = false;
-
- for_each_bset(b, t)
- whiteout_u64s += should_compact_bset(b, t,
- whiteout_u64s != 0, mode);
-
- if (!whiteout_u64s)
- return false;
-
- sort_iter_init(&sort_iter, b);
-
- whiteout_u64s += b->whiteout_u64s;
- order = get_order(whiteout_u64s * sizeof(u64));
-
- whiteouts = btree_bounce_alloc(c, order, &used_mempool);
- u_start = u_pos = whiteouts;
-
- memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
- b->whiteout_u64s);
- u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
-
- sort_iter_add(&sort_iter, u_start, u_pos);
-
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k, *n, *out, *start, *end;
- struct btree_node_entry *src = NULL, *dst = NULL;
-
- if (t != b->set && bset_unwritten(b, i)) {
- src = container_of(i, struct btree_node_entry, keys);
- dst = max(write_block(b),
- (void *) btree_bkey_last(b, t - 1));
- }
-
- if (!should_compact_bset(b, t, compacting, mode)) {
- if (src != dst) {
- memmove(dst, src, sizeof(*src) +
- le16_to_cpu(src->keys.u64s) *
- sizeof(u64));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
- continue;
- }
-
- compacting = true;
- u_start = u_pos;
- start = i->start;
- end = vstruct_last(i);
-
- if (src != dst) {
- memmove(dst, src, sizeof(*src));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
-
- out = i->start;
-
- for (k = start; k != end; k = n) {
- n = bkey_next(k);
-
- if (bkey_deleted(k) && btree_node_is_extents(b))
- continue;
-
- if (bkey_whiteout(k) && !k->needs_whiteout)
- continue;
-
- if (bkey_whiteout(k)) {
- unreserve_whiteout(b, t, k);
- memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
- set_bkeyp_val_u64s(f, u_pos, 0);
- u_pos = bkey_next(u_pos);
- } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
- bkey_copy(out, k);
- out = bkey_next(out);
- }
- }
-
- sort_iter_add(&sort_iter, u_start, u_pos);
-
- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- set_btree_bset_end(b, t);
- bch_bset_set_no_aux_tree(b, t);
- }
- }
-
- b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
-
- BUG_ON((void *) unwritten_whiteouts_start(c, b) <
- (void *) btree_bkey_last(b, bset_tree_last(b)));
-
- u64s = btree_node_is_extents(b)
- ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
- &sort_iter)
- : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
- &sort_iter);
-
- BUG_ON(u64s > b->whiteout_u64s);
- BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
- BUG_ON(u_pos != whiteouts && !u64s);
-
- if (u64s != b->whiteout_u64s) {
- void *src = unwritten_whiteouts_start(c, b);
-
- b->whiteout_u64s = u64s;
- memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
- }
-
- verify_no_dups(b,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
-
- btree_bounce_free(c, order, used_mempool, whiteouts);
-
- if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
- bch_btree_build_aux_trees(b);
-
- bch_btree_keys_u64s_remaining(c, b);
- bch_verify_btree_nr_keys(b);
-
- return true;
-}
-
-static bool bch_drop_whiteouts(struct btree *b)
-{
- struct bset_tree *t;
- bool ret = false;
-
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k, *n, *out, *start, *end;
-
- if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
- continue;
-
- start = btree_bkey_first(b, t);
- end = btree_bkey_last(b, t);
-
- if (bset_unwritten(b, i) &&
- t != b->set) {
- struct bset *dst =
- max_t(struct bset *, write_block(b),
- (void *) btree_bkey_last(b, t - 1));
-
- memmove(dst, i, sizeof(struct bset));
- i = dst;
- set_btree_bset(b, t, i);
- }
-
- out = i->start;
-
- for (k = start; k != end; k = n) {
- n = bkey_next(k);
-
- if (!bkey_whiteout(k)) {
- bkey_copy(out, k);
- out = bkey_next(out);
- }
- }
-
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- bch_bset_set_no_aux_tree(b, t);
- ret = true;
- }
-
- bch_verify_btree_nr_keys(b);
-
- return ret;
-}
-
-static inline int sort_keys_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- return bkey_cmp_packed(b, l, r) ?:
- (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
- (int) l->needs_whiteout - (int) r->needs_whiteout;
-}
-
-static unsigned sort_keys(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
-{
- const struct bkey_format *f = &iter->b->format;
- struct bkey_packed *in, *next, *out = dst;
-
- sort_iter_sort(iter, sort_keys_cmp);
-
- while ((in = sort_iter_next(iter, sort_keys_cmp))) {
- if (bkey_whiteout(in) &&
- (filter_whiteouts || !in->needs_whiteout))
- continue;
-
- if (bkey_whiteout(in) &&
- (next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(iter->b, in, next)) {
- BUG_ON(in->needs_whiteout &&
- next->needs_whiteout);
- /*
- * XXX racy, called with read lock from write path
- *
- * leads to spurious BUG_ON() in bkey_unpack_key() in
- * debug mode
- */
- next->needs_whiteout |= in->needs_whiteout;
- continue;
- }
-
- if (bkey_whiteout(in)) {
- memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
- set_bkeyp_val_u64s(f, out, 0);
- } else {
- bkey_copy(out, in);
- }
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extents_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- return bkey_cmp_packed(b, l, r) ?:
- (int) bkey_deleted(l) - (int) bkey_deleted(r);
-}
-
-static unsigned sort_extents(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
-{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, sort_extents_cmp);
-
- while ((in = sort_iter_next(iter, sort_extents_cmp))) {
- if (bkey_deleted(in))
- continue;
-
- if (bkey_whiteout(in) &&
- (filter_whiteouts || !in->needs_whiteout))
- continue;
-
- bkey_copy(out, in);
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-static void btree_node_sort(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter,
- unsigned start_idx,
- unsigned end_idx,
- bool filter_whiteouts)
-{
- struct btree_node *out;
- struct sort_iter sort_iter;
- struct bset_tree *t;
- struct bset *start_bset = bset(b, &b->set[start_idx]);
- bool used_mempool = false;
- u64 start_time;
- unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
- bool sorting_entire_node = start_idx == 0 &&
- end_idx == b->nsets;
-
- sort_iter_init(&sort_iter, b);
-
- for (t = b->set + start_idx;
- t < b->set + end_idx;
- t++) {
- u64s += le16_to_cpu(bset(b, t)->u64s);
- sort_iter_add(&sort_iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- }
-
- order = sorting_entire_node
- ? btree_page_order(c)
- : get_order(__vstruct_bytes(struct btree_node, u64s));
-
- out = btree_bounce_alloc(c, order, &used_mempool);
-
- start_time = local_clock();
-
- if (btree_node_is_extents(b))
- filter_whiteouts = bset_written(b, start_bset);
-
- u64s = btree_node_is_extents(b)
- ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
- : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
-
- out->keys.u64s = cpu_to_le16(u64s);
-
- BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
-
- if (sorting_entire_node)
- bch_time_stats_update(&c->btree_sort_time, start_time);
-
- /* Make sure we preserve bset journal_seq: */
- for (t = b->set + start_idx + 1;
- t < b->set + end_idx;
- t++)
- start_bset->journal_seq =
- max(start_bset->journal_seq,
- bset(b, t)->journal_seq);
-
- if (sorting_entire_node) {
- unsigned u64s = le16_to_cpu(out->keys.u64s);
-
- BUG_ON(order != btree_page_order(c));
-
- /*
- * Our temporary buffer is the same size as the btree node's
- * buffer, we can just swap buffers instead of doing a big
- * memcpy()
- */
- *out = *b->data;
- out->keys.u64s = cpu_to_le16(u64s);
- swap(out, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- } else {
- start_bset->u64s = out->keys.u64s;
- memcpy_u64s(start_bset->start,
- out->keys.start,
- le16_to_cpu(out->keys.u64s));
- }
-
- for (i = start_idx + 1; i < end_idx; i++)
- b->nr.bset_u64s[start_idx] +=
- b->nr.bset_u64s[i];
-
- b->nsets -= shift;
-
- for (i = start_idx + 1; i < b->nsets; i++) {
- b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
- b->set[i] = b->set[i + shift];
- }
-
- for (i = b->nsets; i < MAX_BSETS; i++)
- b->nr.bset_u64s[i] = 0;
-
- set_btree_bset_end(b, &b->set[start_idx]);
- bch_bset_set_no_aux_tree(b, &b->set[start_idx]);
-
- btree_bounce_free(c, order, used_mempool, out);
-
- bch_verify_btree_nr_keys(b);
-}
-
-/* Sort + repack in a new format: */
-static struct btree_nr_keys sort_repack(struct bset *dst,
- struct btree *src,
- struct btree_node_iter *src_iter,
- struct bkey_format *out_f,
- bool filter_whiteouts)
-{
- struct bkey_format *in_f = &src->format;
- struct bkey_packed *in, *out = vstruct_last(dst);
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- while ((in = bch_btree_node_iter_next_all(src_iter, src))) {
- if (filter_whiteouts && bkey_whiteout(in))
- continue;
-
- if (bch_bkey_transform(out_f, out, bkey_packed(in)
- ? in_f : &bch_bkey_format_current, in))
- out->format = KEY_FORMAT_LOCAL_BTREE;
- else
- bkey_unpack(src, (void *) out, in);
-
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_next(out);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-/* Sort, repack, and merge: */
-static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
- struct bset *dst,
- struct btree *src,
- struct btree_node_iter *iter,
- struct bkey_format *out_f,
- bool filter_whiteouts,
- key_filter_fn filter,
- key_merge_fn merge)
-{
- struct bkey_packed *k, *prev = NULL, *out;
- struct btree_nr_keys nr;
- BKEY_PADDED(k) tmp;
-
- memset(&nr, 0, sizeof(nr));
-
- while ((k = bch_btree_node_iter_next_all(iter, src))) {
- if (filter_whiteouts && bkey_whiteout(k))
- continue;
-
- /*
- * The filter might modify pointers, so we have to unpack the
- * key and values to &tmp.k:
- */
- bkey_unpack(src, &tmp.k, k);
-
- if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
- continue;
-
- /* prev is always unpacked, for key merging: */
-
- if (prev &&
- merge &&
- merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
- continue;
-
- /*
- * the current key becomes the new prev: advance prev, then
- * copy the current key - but first pack prev (in place):
- */
- if (prev) {
- bkey_pack(prev, (void *) prev, out_f);
-
- btree_keys_account_key_add(&nr, 0, prev);
- prev = bkey_next(prev);
- } else {
- prev = vstruct_last(dst);
- }
-
- bkey_copy(prev, &tmp.k);
- }
-
- if (prev) {
- bkey_pack(prev, (void *) prev, out_f);
- btree_keys_account_key_add(&nr, 0, prev);
- out = bkey_next(prev);
- } else {
- out = vstruct_last(dst);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-void bch_btree_sort_into(struct bch_fs *c,
- struct btree *dst,
- struct btree *src)
-{
- struct btree_nr_keys nr;
- struct btree_node_iter src_iter;
- u64 start_time = local_clock();
-
- BUG_ON(dst->nsets != 1);
-
- bch_bset_set_no_aux_tree(dst, dst->set);
-
- bch_btree_node_iter_init_from_start(&src_iter, src,
- btree_node_is_extents(src));
-
- if (btree_node_ops(src)->key_normalize ||
- btree_node_ops(src)->key_merge)
- nr = sort_repack_merge(c, btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true,
- btree_node_ops(src)->key_normalize,
- btree_node_ops(src)->key_merge);
- else
- nr = sort_repack(btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true);
-
- bch_time_stats_update(&c->btree_sort_time, start_time);
-
- set_btree_bset_end(dst, dst->set);
-
- dst->nr.live_u64s += nr.live_u64s;
- dst->nr.bset_u64s[0] += nr.bset_u64s[0];
- dst->nr.packed_keys += nr.packed_keys;
- dst->nr.unpacked_keys += nr.unpacked_keys;
-
- bch_verify_btree_nr_keys(dst);
-}
-
-#define SORT_CRIT (4096 / sizeof(u64))
-
-/*
- * We're about to add another bset to the btree node, so if there's currently
- * too many bsets - sort some of them together:
- */
-static bool btree_node_compact(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
-{
- unsigned unwritten_idx;
- bool ret = false;
-
- for (unwritten_idx = 0;
- unwritten_idx < b->nsets;
- unwritten_idx++)
- if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
- break;
-
- if (b->nsets - unwritten_idx > 1) {
- btree_node_sort(c, b, iter, unwritten_idx,
- b->nsets, false);
- ret = true;
- }
-
- if (unwritten_idx > 1) {
- btree_node_sort(c, b, iter, 0, unwritten_idx, false);
- ret = true;
- }
-
- return ret;
-}
-
-void bch_btree_build_aux_trees(struct btree *b)
-{
- struct bset_tree *t;
-
- for_each_bset(b, t)
- bch_bset_build_aux_tree(b, t,
- bset_unwritten(b, bset(b, t)) &&
- t == bset_tree_last(b));
-}
-
-/*
- * @bch_btree_init_next - initialize a new (unwritten) bset that can then be
- * inserted into
- *
- * Safe to call if there already is an unwritten bset - will only add a new bset
- * if @b doesn't already have one.
- *
- * If we sorted (invalidating iterators), @iter is reinitialized via
- * bch_btree_iter_reinit_node().
- */
-void bch_btree_init_next(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
-{
- struct btree_node_entry *bne;
- bool did_sort;
-
- EBUG_ON(!(b->lock.state.seq & 1));
- EBUG_ON(iter && iter->nodes[b->level] != b);
-
- did_sort = btree_node_compact(c, b, iter);
-
- bne = want_new_bset(c, b);
- if (bne)
- bch_bset_init_next(b, &bne->keys);
-
- bch_btree_build_aux_trees(b);
-
- if (iter && did_sort)
- bch_btree_iter_reinit_node(iter, b);
-}
-
-static struct nonce btree_nonce(struct btree *b,
- struct bset *i,
- unsigned offset)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32(offset),
- [1] = ((__le32 *) &i->seq)[0],
- [2] = ((__le32 *) &i->seq)[1],
- [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
- }};
-}
-
-static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
-{
- bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
-}
-
-#define btree_node_error(b, c, ptr, fmt, ...) \
- bch_fs_inconsistent(c, \
- "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
- (b)->btree_id, (b)->level, btree_node_root(c, b) \
- ? btree_node_root(c, b)->level : -1, \
- PTR_BUCKET_NR(ca, ptr), (b)->written, \
- le16_to_cpu((i)->u64s), ##__VA_ARGS__)
-
-static const char *validate_bset(struct bch_fs *c, struct btree *b,
- struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- struct bset *i, unsigned sectors,
- unsigned *whiteout_u64s)
-{
- struct bkey_packed *k, *prev = NULL;
- struct bpos prev_pos = POS_MIN;
- bool seen_non_whiteout = false;
-
- if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
- return "unsupported bset version";
-
- if (b->written + sectors > c->sb.btree_node_size)
- return "bset past end of btree node";
-
- if (i != &b->data->keys && !i->u64s)
- btree_node_error(b, c, ptr, "empty set");
-
- if (!BSET_SEPARATE_WHITEOUTS(i)) {
- seen_non_whiteout = true;
- *whiteout_u64s = 0;
- }
-
- for (k = i->start;
- k != vstruct_last(i);) {
- struct bkey_s_c u;
- struct bkey tmp;
- const char *invalid;
-
- if (!k->u64s) {
- btree_node_error(b, c, ptr,
- "KEY_U64s 0: %zu bytes of metadata lost",
- vstruct_end(i) - (void *) k);
-
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
- }
-
- if (bkey_next(k) > vstruct_last(i)) {
- btree_node_error(b, c, ptr,
- "key extends past end of bset");
-
- i->u64s = cpu_to_le16((u64 *) k - i->_data);
- break;
- }
-
- if (k->format > KEY_FORMAT_CURRENT) {
- btree_node_error(b, c, ptr,
- "invalid bkey format %u", k->format);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
-
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
- bch_bkey_swab(btree_node_type(b), &b->format, k);
-
- u = bkey_disassemble(b, k, &tmp);
-
- invalid = btree_bkey_invalid(c, b, u);
- if (invalid) {
- char buf[160];
-
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), u);
- btree_node_error(b, c, ptr,
- "invalid bkey %s: %s", buf, invalid);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
-
- /*
- * with the separate whiteouts thing (used for extents), the
- * second set of keys actually can have whiteouts too, so we
- * can't solely go off bkey_whiteout()...
- */
-
- if (!seen_non_whiteout &&
- (!bkey_whiteout(k) ||
- (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
- *whiteout_u64s = k->_data - i->_data;
- seen_non_whiteout = true;
- } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
- btree_node_error(b, c, ptr,
- "keys out of order: %llu:%llu > %llu:%llu",
- prev_pos.inode,
- prev_pos.offset,
- u.k->p.inode,
- bkey_start_offset(u.k));
- /* XXX: repair this */
- }
-
- prev_pos = u.k->p;
- prev = k;
- k = bkey_next(k);
- }
-
- SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
- return NULL;
-}
-
-static bool extent_contains_ptr(struct bkey_s_c_extent e,
- struct bch_extent_ptr match)
-{
- const struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(e, ptr)
- if (!memcmp(ptr, &match, sizeof(*ptr)))
- return true;
-
- return false;
-}
-
-void bch_btree_node_read_done(struct bch_fs *c, struct btree *b,
- struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- struct btree_node_entry *bne;
- struct bset *i = &b->data->keys;
- struct btree_node_iter *iter;
- struct btree_node *sorted;
- bool used_mempool;
- unsigned u64s;
- const char *err;
- struct bch_csum csum;
- struct nonce nonce;
- int ret;
-
- iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
- __bch_btree_node_iter_init(iter, btree_node_is_extents(b));
-
- err = "dynamic fault";
- if (bch_meta_read_fault("btree"))
- goto err;
-
- while (b->written < c->sb.btree_node_size) {
- unsigned sectors, whiteout_u64s = 0;
-
- if (!b->written) {
- i = &b->data->keys;
-
- err = "bad magic";
- if (le64_to_cpu(b->data->magic) != bset_magic(c))
- goto err;
-
- err = "bad btree header";
- if (!b->data->keys.seq)
- goto err;
-
- err = "unknown checksum type";
- if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
- goto err;
-
- /* XXX: retry checksum errors */
-
- nonce = btree_nonce(b, i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
-
- err = "bad checksum";
- if (bch_crc_cmp(csum, b->data->csum))
- goto err;
-
- bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
- &b->data->flags,
- (void *) &b->data->keys -
- (void *) &b->data->flags);
- nonce = nonce_add(nonce,
- round_up((void *) &b->data->keys -
- (void *) &b->data->flags,
- CHACHA20_BLOCK_SIZE));
- bset_encrypt(c, i, nonce);
-
- sectors = vstruct_sectors(b->data, c->block_bits);
-
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
- u64 *p = (u64 *) &b->data->ptr;
-
- *p = swab64(*p);
- bch_bpos_swab(&b->data->min_key);
- bch_bpos_swab(&b->data->max_key);
- }
-
- err = "incorrect btree id";
- if (BTREE_NODE_ID(b->data) != b->btree_id)
- goto err;
-
- err = "incorrect level";
- if (BTREE_NODE_LEVEL(b->data) != b->level)
- goto err;
-
- err = "incorrect max key";
- if (bkey_cmp(b->data->max_key, b->key.k.p))
- goto err;
-
- err = "incorrect backpointer";
- if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
- b->data->ptr))
- goto err;
-
- err = bch_bkey_format_validate(&b->data->format);
- if (err)
- goto err;
-
- set_btree_bset(b, b->set, &b->data->keys);
-
- btree_node_set_format(b, b->data->format);
- } else {
- bne = write_block(b);
- i = &bne->keys;
-
- if (i->seq != b->data->keys.seq)
- break;
-
- err = "unknown checksum type";
- if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
- goto err;
-
- nonce = btree_nonce(b, i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- err = "bad checksum";
- if (memcmp(&csum, &bne->csum, sizeof(csum)))
- goto err;
-
- bset_encrypt(c, i, nonce);
-
- sectors = vstruct_sectors(bne, c->block_bits);
- }
-
- err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
- if (err)
- goto err;
-
- b->written += sectors;
-
- err = "insufficient memory";
- ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
- if (ret < 0)
- goto err;
-
- if (ret)
- continue;
-
- __bch_btree_node_iter_push(iter, b,
- i->start,
- vstruct_idx(i, whiteout_u64s));
-
- __bch_btree_node_iter_push(iter, b,
- vstruct_idx(i, whiteout_u64s),
- vstruct_last(i));
- }
-
- err = "corrupted btree";
- for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
- bne = (void *) bne + block_bytes(c))
- if (bne->keys.seq == b->data->keys.seq)
- goto err;
-
- sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
- sorted->keys.u64s = 0;
-
- b->nr = btree_node_is_extents(b)
- ? bch_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
- : bch_key_sort_fix_overlapping(&sorted->keys, b, iter);
-
- u64s = le16_to_cpu(sorted->keys.u64s);
- *sorted = *b->data;
- sorted->keys.u64s = cpu_to_le16(u64s);
- swap(sorted, b->data);
- set_btree_bset(b, b->set, &b->data->keys);
- b->nsets = 1;
-
- BUG_ON(b->nr.live_u64s != u64s);
-
- btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
-
- bch_bset_build_aux_tree(b, b->set, false);
-
- set_needs_whiteout(btree_bset_first(b));
-
- btree_node_reset_sib_u64s(b);
-out:
- mempool_free(iter, &c->fill_iter);
- return;
-err:
- set_btree_node_read_error(b);
- btree_node_error(b, c, ptr, "%s", err);
- goto out;
-}
-
-static void btree_node_read_endio(struct bio *bio)
-{
- closure_put(bio->bi_private);
-}
-
-void bch_btree_node_read(struct bch_fs *c, struct btree *b)
-{
- uint64_t start_time = local_clock();
- struct closure cl;
- struct bio *bio;
- struct extent_pick_ptr pick;
-
- trace_bcache_btree_read(c, b);
-
- closure_init_stack(&cl);
-
- pick = bch_btree_pick_ptr(c, b);
- if (bch_fs_fatal_err_on(!pick.ca, c,
- "no cache device for btree node")) {
- set_btree_node_read_error(b);
- return;
- }
-
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
- bio->bi_bdev = pick.ca->disk_sb.bdev;
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
- bio->bi_end_io = btree_node_read_endio;
- bio->bi_private = &cl;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
-
- bch_bio_map(bio, b->data);
-
- closure_get(&cl);
- bch_generic_make_request(bio, c);
- closure_sync(&cl);
-
- if (bch_dev_fatal_io_err_on(bio->bi_error,
- pick.ca, "IO error reading bucket %zu",
- PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
- bch_meta_read_fault("btree")) {
- set_btree_node_read_error(b);
- goto out;
- }
-
- bch_btree_node_read_done(c, b, pick.ca, &pick.ptr);
- bch_time_stats_update(&c->btree_read_time, start_time);
-out:
- bio_put(bio);
- percpu_ref_put(&pick.ca->io_ref);
-}
-
-int bch_btree_root_read(struct bch_fs *c, enum btree_id id,
- const struct bkey_i *k, unsigned level)
-{
- struct closure cl;
- struct btree *b;
- int ret;
-
- closure_init_stack(&cl);
-
- do {
- ret = mca_cannibalize_lock(c, &cl);
- closure_sync(&cl);
- } while (ret);
-
- b = mca_alloc(c);
- mca_cannibalize_unlock(c);
-
- BUG_ON(IS_ERR(b));
-
- bkey_copy(&b->key, k);
- BUG_ON(mca_hash_insert(c, b, level, id));
-
- bch_btree_node_read(c, b);
- six_unlock_write(&b->lock);
-
- if (btree_node_read_error(b)) {
- six_unlock_intent(&b->lock);
- return -EIO;
- }
-
- bch_btree_set_root_initial(c, b, NULL);
- six_unlock_intent(&b->lock);
-
- return 0;
-}
-
-void bch_btree_complete_write(struct bch_fs *c, struct btree *b,
- struct btree_write *w)
-{
- bch_journal_pin_drop(&c->journal, &w->journal);
- closure_wake_up(&w->wait);
-}
-
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
-{
- struct btree_write *w = btree_prev_write(b);
-
- /*
- * Before calling bch_btree_complete_write() - if the write errored, we
- * have to halt new journal writes before they see this btree node
- * write as completed:
- */
- if (btree_node_write_error(b))
- bch_journal_halt(&c->journal);
-
- bch_btree_complete_write(c, b, w);
- btree_node_io_unlock(b);
-}
-
-static void btree_node_write_endio(struct bio *bio)
-{
- struct btree *b = bio->bi_private;
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_fs *c = wbio->c;
- struct bio *orig = wbio->split ? wbio->orig : NULL;
- struct closure *cl = !wbio->split ? wbio->cl : NULL;
- struct bch_dev *ca = wbio->ca;
-
- if (bch_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
- bch_meta_write_fault("btree"))
- set_btree_node_write_error(b);
-
- if (wbio->bounce)
- btree_bounce_free(c,
- wbio->order,
- wbio->used_mempool,
- page_address(bio->bi_io_vec[0].bv_page));
-
- if (wbio->put_bio)
- bio_put(bio);
-
- if (orig) {
- bio_endio(orig);
- } else {
- btree_node_write_done(c, b);
- if (cl)
- closure_put(cl);
- }
-
- if (ca)
- percpu_ref_put(&ca->io_ref);
-}
-
-void __bch_btree_node_write(struct bch_fs *c, struct btree *b,
- struct closure *parent,
- enum six_lock_type lock_type_held,
- int idx_to_write)
-{
- struct bio *bio;
- struct bch_write_bio *wbio;
- struct bset_tree *t;
- struct bset *i;
- struct btree_node *bn = NULL;
- struct btree_node_entry *bne = NULL;
- BKEY_PADDED(key) k;
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- struct sort_iter sort_iter;
- struct nonce nonce;
- unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
- u64 seq = 0;
- bool used_mempool;
- unsigned long old, new;
- void *data;
-
- /*
- * We may only have a read lock on the btree node - the dirty bit is our
- * "lock" against racing with other threads that may be trying to start
- * a write, we do a write iff we clear the dirty bit. Since setting the
- * dirty bit requires a write lock, we can't race with other threads
- * redirtying it:
- */
- do {
- old = new = READ_ONCE(b->flags);
-
- if (!(old & (1 << BTREE_NODE_dirty)))
- return;
-
- if (idx_to_write >= 0 &&
- idx_to_write != !!(old & (1 << BTREE_NODE_write_idx)))
- return;
-
- if (old & (1 << BTREE_NODE_write_in_flight)) {
- wait_on_bit_io(&b->flags,
- BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
- continue;
- }
-
- new &= ~(1 << BTREE_NODE_dirty);
- new |= (1 << BTREE_NODE_write_in_flight);
- new |= (1 << BTREE_NODE_just_written);
- new ^= (1 << BTREE_NODE_write_idx);
- } while (cmpxchg_acquire(&b->flags, old, new) != old);
-
- BUG_ON(!list_empty(&b->write_blocked));
-
- BUG_ON(b->written >= c->sb.btree_node_size);
- BUG_ON(bset_written(b, btree_bset_last(b)));
- BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
- BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
-
- if (lock_type_held == SIX_LOCK_intent) {
- six_lock_write(&b->lock);
- __bch_compact_whiteouts(c, b, COMPACT_WRITTEN);
- six_unlock_write(&b->lock);
- } else {
- __bch_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
- }
-
- BUG_ON(b->uncompacted_whiteout_u64s);
-
- sort_iter_init(&sort_iter, b);
-
- bytes = !b->written
- ? sizeof(struct btree_node)
- : sizeof(struct btree_node_entry);
-
- bytes += b->whiteout_u64s * sizeof(u64);
-
- for_each_bset(b, t) {
- i = bset(b, t);
-
- if (bset_written(b, i))
- continue;
-
- bytes += le16_to_cpu(i->u64s) * sizeof(u64);
- sort_iter_add(&sort_iter,
- btree_bkey_first(b, t),
- btree_bkey_last(b, t));
- seq = max(seq, le64_to_cpu(i->journal_seq));
- }
-
- order = get_order(bytes);
- data = btree_bounce_alloc(c, order, &used_mempool);
-
- if (!b->written) {
- bn = data;
- *bn = *b->data;
- i = &bn->keys;
- } else {
- bne = data;
- bne->keys = b->data->keys;
- i = &bne->keys;
- }
-
- i->journal_seq = cpu_to_le64(seq);
- i->u64s = 0;
-
- if (!btree_node_is_extents(b)) {
- sort_iter_add(&sort_iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
- SET_BSET_SEPARATE_WHITEOUTS(i, false);
- } else {
- memcpy_u64s(i->start,
- unwritten_whiteouts_start(c, b),
- b->whiteout_u64s);
- i->u64s = cpu_to_le16(b->whiteout_u64s);
- SET_BSET_SEPARATE_WHITEOUTS(i, true);
- }
-
- b->whiteout_u64s = 0;
-
- u64s = btree_node_is_extents(b)
- ? sort_extents(vstruct_last(i), &sort_iter, false)
- : sort_keys(i->start, &sort_iter, false);
- le16_add_cpu(&i->u64s, u64s);
-
- clear_needs_whiteout(i);
-
- if (b->written && !i->u64s) {
- /* Nothing to write: */
- btree_bounce_free(c, order, used_mempool, data);
- btree_node_write_done(c, b);
- return;
- }
-
- BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
- BUG_ON(i->seq != b->data->keys.seq);
-
- i->version = cpu_to_le16(BCACHE_BSET_VERSION);
- SET_BSET_CSUM_TYPE(i, bch_meta_checksum_type(c));
-
- nonce = btree_nonce(b, i, b->written << 9);
-
- if (bn) {
- bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
- &bn->flags,
- (void *) &b->data->keys -
- (void *) &b->data->flags);
- nonce = nonce_add(nonce,
- round_up((void *) &b->data->keys -
- (void *) &b->data->flags,
- CHACHA20_BLOCK_SIZE));
- bset_encrypt(c, i, nonce);
-
- nonce = btree_nonce(b, i, b->written << 9);
- bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
- } else {
- bset_encrypt(c, i, nonce);
-
- bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
- }
-
- bytes_to_write = vstruct_end(i) - data;
- sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
- memset(data + bytes_to_write, 0,
- (sectors_to_write << 9) - bytes_to_write);
-
- BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
-
- trace_bcache_btree_write(b, bytes_to_write, sectors_to_write);
-
- /*
- * We handle btree write errors by immediately halting the journal -
- * after we've done that, we can't issue any subsequent btree writes
- * because they might have pointers to new nodes that failed to write.
- *
- * Furthermore, there's no point in doing any more btree writes because
- * with the journal stopped, we're never going to update the journal to
- * reflect that those writes were done and the data flushed from the
- * journal:
- *
- * Make sure to update b->written so bch_btree_init_next() doesn't
- * break:
- */
- if (bch_journal_error(&c->journal) ||
- c->opts.nochanges) {
- set_btree_node_noevict(b);
- b->written += sectors_to_write;
-
- btree_bounce_free(c, order, used_mempool, data);
- btree_node_write_done(c, b);
- return;
- }
-
- bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
-
- wbio = to_wbio(bio);
- wbio->cl = parent;
- wbio->bounce = true;
- wbio->put_bio = true;
- wbio->order = order;
- wbio->used_mempool = used_mempool;
- bio->bi_iter.bi_size = sectors_to_write << 9;
- bio->bi_end_io = btree_node_write_endio;
- bio->bi_private = b;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
-
- if (parent)
- closure_get(parent);
-
- bch_bio_map(bio, data);
-
- /*
- * If we're appending to a leaf node, we don't technically need FUA -
- * this write just needs to be persisted before the next journal write,
- * which will be marked FLUSH|FUA.
- *
- * Similarly if we're writing a new btree root - the pointer is going to
- * be in the next journal entry.
- *
- * But if we're writing a new btree node (that isn't a root) or
- * appending to a non leaf btree node, we need either FUA or a flush
- * when we write the parent with the new pointer. FUA is cheaper than a
- * flush, and writes appending to leaf nodes aren't blocking anything so
- * just make all btree node writes FUA to keep things sane.
- */
-
- bkey_copy(&k.key, &b->key);
- e = bkey_i_to_s_extent(&k.key);
-
- extent_for_each_ptr(e, ptr)
- ptr->offset += b->written;
-
- extent_for_each_ptr(e, ptr)
- atomic64_add(sectors_to_write,
- &c->devs[ptr->dev]->btree_sectors_written);
-
- b->written += sectors_to_write;
-
- bch_submit_wbio_replicas(wbio, c, &k.key, true);
-}
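The cmpxchg loop at the top of __bch_btree_node_write() is a claim-by-clearing-the-dirty-bit protocol: whichever thread makes the dirty -> clear transition owns this write. A standalone sketch of the same idea, using made-up flag bits rather than the BTREE_NODE_* flags (and returning early instead of waiting on an in-flight write):

/* Sketch only: generic "claim a write by clearing the dirty bit" pattern. */
#define EX_DIRTY		(1UL << 0)
#define EX_WRITE_IN_FLIGHT	(1UL << 1)

static bool example_claim_write(unsigned long *flags)
{
	unsigned long old, new;

	do {
		old = new = READ_ONCE(*flags);

		if (!(old & EX_DIRTY))			/* nothing to write */
			return false;
		if (old & EX_WRITE_IN_FLIGHT)		/* someone else owns it */
			return false;

		new &= ~EX_DIRTY;			/* clearing dirty claims the write */
		new |= EX_WRITE_IN_FLIGHT;
	} while (cmpxchg(flags, old, new) != old);

	return true;
}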
-
-/*
- * Work that must be done with write lock held:
- */
-bool bch_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
-{
- bool invalidated_iter = false;
- struct btree_node_entry *bne;
- struct bset_tree *t;
-
- if (!btree_node_just_written(b))
- return false;
-
- BUG_ON(b->whiteout_u64s);
- BUG_ON(b->uncompacted_whiteout_u64s);
-
- clear_btree_node_just_written(b);
-
- /*
- * Note: immediately after write, bset_unwritten()/bset_written() don't
- * work - the amount of data we had to write after compaction might have
- * been smaller than the offset of the last bset.
- *
- * However, we know that all bsets have been written here, as long as
- * we're still holding the write lock:
- */
-
- /*
- * XXX: decide if we really want to unconditionally sort down to a
- * single bset:
- */
- if (b->nsets > 1) {
- btree_node_sort(c, b, NULL, 0, b->nsets, true);
- invalidated_iter = true;
- } else {
- invalidated_iter = bch_drop_whiteouts(b);
- }
-
- for_each_bset(b, t)
- set_needs_whiteout(bset(b, t));
-
- bch_btree_verify(c, b);
-
- /*
- * If later we don't unconditionally sort down to a single bset, we have
- * to ensure this is still true:
- */
- BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
-
- bne = want_new_bset(c, b);
- if (bne)
- bch_bset_init_next(b, &bne->keys);
-
- bch_btree_build_aux_trees(b);
-
- return invalidated_iter;
-}
-
-/*
- * Use this one if the node is intent locked:
- */
-void bch_btree_node_write(struct bch_fs *c, struct btree *b,
- struct closure *parent,
- enum six_lock_type lock_type_held,
- int idx_to_write)
-{
- BUG_ON(lock_type_held == SIX_LOCK_write);
-
- if (lock_type_held == SIX_LOCK_intent ||
- six_trylock_convert(&b->lock, SIX_LOCK_read,
- SIX_LOCK_intent)) {
- __bch_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
-
- six_lock_write(&b->lock);
- bch_btree_post_write_cleanup(c, b);
- six_unlock_write(&b->lock);
-
- if (lock_type_held == SIX_LOCK_read)
- six_lock_downgrade(&b->lock);
- } else {
- __bch_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
- }
-}
-
-static void bch_btree_node_write_dirty(struct bch_fs *c, struct btree *b,
- struct closure *parent)
-{
- six_lock_read(&b->lock);
- BUG_ON(b->level);
-
- bch_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
- six_unlock_read(&b->lock);
-}
-
-/*
- * Write all dirty btree nodes to disk, including roots
- */
-void bch_btree_flush(struct bch_fs *c)
-{
- struct closure cl;
- struct btree *b;
- struct bucket_table *tbl;
- struct rhash_head *pos;
- bool dropped_lock;
- unsigned i;
-
- closure_init_stack(&cl);
-
- rcu_read_lock();
-
- do {
- dropped_lock = false;
- i = 0;
-restart:
- tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
- &c->btree_cache_table);
-
- for (; i < tbl->size; i++)
- rht_for_each_entry_rcu(b, pos, tbl, i, hash)
- /*
- * XXX - locking for b->level, when called from
- * bch_journal_move()
- */
- if (!b->level && btree_node_dirty(b)) {
- rcu_read_unlock();
- bch_btree_node_write_dirty(c, b, &cl);
- dropped_lock = true;
- rcu_read_lock();
- goto restart;
- }
- } while (dropped_lock);
-
- rcu_read_unlock();
-
- closure_sync(&cl);
-}
-
-/**
- * bch_btree_node_flush_journal_entries - flush any journal entries that
- * contain keys from this node
- *
- * The bset's journal sequence number is used for preserving ordering of index
- * updates across unclean shutdowns - it's used to ignore bsets newer than the
- * most recent journal entry.
- *
- * But when rewriting btree nodes we compact all the bsets in a btree node - and
- * if we compacted a bset that should be ignored with bsets we do need, that
- * would be bad. So to avoid that, prior to making the new node visible ensure
- * that the journal has been flushed so that all the bsets we compacted should
- * be visible.
- */
-void bch_btree_node_flush_journal_entries(struct bch_fs *c,
- struct btree *b,
- struct closure *cl)
-{
- int i = b->nsets;
-
- /*
- * Journal sequence numbers in the different bsets will always be in
- * ascending order, we only need to flush the highest - except that the
- * most recent bset might not have a journal sequence number yet, so we
- * need to loop:
- */
- while (i--) {
- u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
-
- if (seq) {
- bch_journal_flush_seq_async(&c->journal, seq, cl);
- break;
- }
- }
-}
diff --git a/libbcache/btree_io.h b/libbcache/btree_io.h
deleted file mode 100644
index 0f75f456..00000000
--- a/libbcache/btree_io.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef _BCACHE_BTREE_IO_H
-#define _BCACHE_BTREE_IO_H
-
-struct bch_fs;
-struct btree_write;
-struct btree;
-struct btree_iter;
-
-static inline void btree_node_io_unlock(struct btree *b)
-{
- EBUG_ON(!btree_node_write_in_flight(b));
- clear_btree_node_write_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static inline void btree_node_io_lock(struct btree *b)
-{
- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-enum compact_mode {
- COMPACT_LAZY,
- COMPACT_WRITTEN,
- COMPACT_WRITTEN_NO_WRITE_LOCK,
-};
-
-bool __bch_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
-
-static inline bool bch_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
-{
- struct bset_tree *t;
-
- for_each_bset(b, t) {
- unsigned live_u64s = b->nr.bset_u64s[t - b->set];
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
-
- if (live_u64s * 4 < bset_u64s * 3)
- goto compact;
- }
-
- return false;
-compact:
- return __bch_compact_whiteouts(c, b, COMPACT_LAZY);
-}
-
-void bch_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
-
-void bch_btree_build_aux_trees(struct btree *);
-void bch_btree_init_next(struct bch_fs *, struct btree *,
- struct btree_iter *);
-
-void bch_btree_node_read_done(struct bch_fs *, struct btree *,
- struct bch_dev *, const struct bch_extent_ptr *);
-void bch_btree_node_read(struct bch_fs *, struct btree *);
-int bch_btree_root_read(struct bch_fs *, enum btree_id,
- const struct bkey_i *, unsigned);
-
-void bch_btree_complete_write(struct bch_fs *, struct btree *,
- struct btree_write *);
-
-void __bch_btree_node_write(struct bch_fs *, struct btree *,
- struct closure *, enum six_lock_type, int);
-bool bch_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-
-void bch_btree_node_write(struct bch_fs *, struct btree *,
- struct closure *, enum six_lock_type, int);
-
-void bch_btree_flush(struct bch_fs *);
-void bch_btree_node_flush_journal_entries(struct bch_fs *, struct btree *,
- struct closure *);
-
-#endif /* _BCACHE_BTREE_IO_H */
diff --git a/libbcache/btree_iter.c b/libbcache/btree_iter.c
deleted file mode 100644
index 04b4bc2e..00000000
--- a/libbcache/btree_iter.c
+++ /dev/null
@@ -1,1150 +0,0 @@
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "debug.h"
-#include "extents.h"
-
-#include <trace/events/bcache.h>
-
-#define BTREE_ITER_NOT_END ((struct btree *) 1)
-
-static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
-{
- return iter->nodes[l] && iter->nodes[l] != BTREE_ITER_NOT_END;
-}
-
-/* Btree node locking: */
-
-/*
- * Updates the saved lock sequence number, so that btree_node_relock() will
- * succeed:
- */
-void btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
-{
- struct btree_iter *linked;
-
- EBUG_ON(iter->nodes[b->level] != b);
- EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
-
- for_each_linked_btree_node(iter, b, linked)
- linked->lock_seq[b->level] += 2;
-
- iter->lock_seq[b->level] += 2;
-
- six_unlock_write(&b->lock);
-}
-
-void btree_node_lock_write(struct btree *b, struct btree_iter *iter)
-{
- struct btree_iter *linked;
- unsigned readers = 0;
-
- EBUG_ON(iter->nodes[b->level] != b);
- EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
-
- if (six_trylock_write(&b->lock))
- return;
-
- for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[b->level] == b &&
- btree_node_read_locked(linked, b->level))
- readers++;
-
- if (likely(!readers)) {
- six_lock_write(&b->lock);
- } else {
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- atomic64_sub(__SIX_VAL(read_lock, readers),
- &b->lock.state.counter);
- six_lock_write(&b->lock);
- atomic64_add(__SIX_VAL(read_lock, readers),
- &b->lock.state.counter);
- }
-}
-
-/* versions that allow iter to be null: */
-void __btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
-{
- if (likely(iter))
- btree_node_unlock_write(b, iter);
- else
- six_unlock_write(&b->lock);
-}
-
-void __btree_node_lock_write(struct btree *b, struct btree_iter *iter)
-{
- if (likely(iter))
- btree_node_lock_write(b, iter);
- else
- six_lock_write(&b->lock);
-}
-
-bool btree_node_relock(struct btree_iter *iter, unsigned level)
-{
- struct btree_iter *linked;
- struct btree *b = iter->nodes[level];
- enum btree_node_locked_type want = btree_lock_want(iter, level);
- enum btree_node_locked_type have = btree_node_locked_type(iter, level);
-
- if (want == have)
- return true;
-
- if (!is_btree_node(iter, level))
- return false;
-
- if (race_fault())
- return false;
-
- if (have != BTREE_NODE_UNLOCKED
- ? six_trylock_convert(&b->lock, have, want)
- : six_relock_type(&b->lock, want, iter->lock_seq[level]))
- goto success;
-
- for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[level] == b &&
- btree_node_locked_type(linked, level) == want &&
- iter->lock_seq[level] == b->lock.state.seq) {
- btree_node_unlock(iter, level);
- six_lock_increment(&b->lock, want);
- goto success;
- }
-
- return false;
-success:
- mark_btree_node_unlocked(iter, level);
- mark_btree_node_locked(iter, level, want);
- return true;
-}
-
-/* Slowpath: */
-bool __bch_btree_node_lock(struct btree *b, struct bpos pos,
- unsigned level,
- struct btree_iter *iter,
- enum six_lock_type type)
-{
- struct btree_iter *linked;
-
- /* Can't have children locked before ancestors: */
- EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
-
- /*
- * Can't hold any read locks while we block taking an intent lock - see
- * below for reasoning, and we should have already dropped any read
- * locks in the current iterator
- */
- EBUG_ON(type == SIX_LOCK_intent &&
- iter->nodes_locked != iter->nodes_intent_locked);
-
- for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[level] == b &&
- btree_node_locked_type(linked, level) == type) {
- six_lock_increment(&b->lock, type);
- return true;
- }
-
- /*
- * Must lock btree nodes in key order - this case happens when locking
- * the prev sibling in btree node merging:
- */
- if (iter->nodes_locked &&
- __ffs(iter->nodes_locked) == level &&
- __btree_iter_cmp(iter->btree_id, pos, iter))
- return false;
-
- for_each_linked_btree_iter(iter, linked) {
- if (!linked->nodes_locked)
- continue;
-
- /*
- * Can't block taking an intent lock if we have _any_ nodes read
- * locked:
- *
- * - Our read lock blocks another thread with an intent lock on
- * the same node from getting a write lock, and thus from
- * dropping its intent lock
- *
- * - And the other thread may have multiple nodes intent locked:
- * both the node we want to intent lock, and the node we
- * already have read locked - deadlock:
- */
- if (type == SIX_LOCK_intent &&
- linked->nodes_locked != linked->nodes_intent_locked) {
- linked->locks_want = max(linked->locks_want,
- iter->locks_want);
- return false;
- }
-
- /* We have to lock btree nodes in key order: */
- if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
- return false;
-
- /*
- * Interior nodes must be locked before their descendants: if
- * another iterator has possible descendants locked of the node
- * we're about to lock, it must have the ancestors locked too:
- */
- if (linked->btree_id == iter->btree_id &&
- level > __fls(linked->nodes_locked)) {
- linked->locks_want = max(linked->locks_want,
- iter->locks_want);
- return false;
- }
- }
-
- six_lock_type(&b->lock, type);
- return true;
-}
-
-/* Btree iterator locking: */
-
-
-static void btree_iter_drop_extra_locks(struct btree_iter *iter)
-{
- unsigned l;
-
- while (iter->nodes_locked &&
- (l = __fls(iter->nodes_locked)) > iter->locks_want) {
- if (!btree_node_locked(iter, l))
- panic("l %u nodes_locked %u\n", l, iter->nodes_locked);
-
- if (l > iter->level) {
- btree_node_unlock(iter, l);
- } else if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->nodes[l]->lock);
- iter->nodes_intent_locked ^= 1 << l;
- }
- }
-}
-
-bool __bch_btree_iter_set_locks_want(struct btree_iter *iter,
- unsigned new_locks_want)
-{
- struct btree_iter *linked;
- unsigned l;
-
- /* Drop locks we don't want anymore: */
- if (new_locks_want < iter->locks_want)
- for_each_linked_btree_iter(iter, linked)
- if (linked->locks_want > new_locks_want) {
- linked->locks_want = max_t(unsigned, 1,
- new_locks_want);
- btree_iter_drop_extra_locks(linked);
- }
-
- iter->locks_want = new_locks_want;
- btree_iter_drop_extra_locks(iter);
-
- for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
- if (!btree_node_relock(iter, l))
- goto fail;
-
- return true;
-fail:
- /*
- * Just an optimization: ancestor nodes must be locked before child
- * nodes, so set locks_want on iterators that might lock ancestors
- * before us to avoid getting -EINTR later:
- */
- for_each_linked_btree_iter(iter, linked)
- if (linked->btree_id == iter->btree_id &&
- btree_iter_cmp(linked, iter) <= 0)
- linked->locks_want = max_t(unsigned, linked->locks_want,
- new_locks_want);
- return false;
-}
-
-static int __bch_btree_iter_unlock(struct btree_iter *iter)
-{
- BUG_ON(iter->error == -EINTR);
-
- while (iter->nodes_locked)
- btree_node_unlock(iter, __ffs(iter->nodes_locked));
-
- return iter->error;
-}
-
-int bch_btree_iter_unlock(struct btree_iter *iter)
-{
- struct btree_iter *linked;
-
- for_each_linked_btree_iter(iter, linked)
- __bch_btree_iter_unlock(linked);
- return __bch_btree_iter_unlock(iter);
-}
-
-/* Btree iterator: */
-
-#ifdef CONFIG_BCACHE_DEBUG
-
-static void __bch_btree_iter_verify(struct btree_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter *node_iter = &iter->node_iters[b->level];
- struct btree_node_iter tmp = *node_iter;
- struct bkey_packed *k;
-
- bch_btree_node_iter_verify(node_iter, b);
-
- /*
- * For interior nodes, the iterator will have skipped past
- * deleted keys:
- */
- k = b->level
- ? bch_btree_node_iter_prev(&tmp, b)
- : bch_btree_node_iter_prev_all(&tmp, b);
- if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
- iter->is_extents)) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
-
- bch_bkey_to_text(buf, sizeof(buf), &uk);
- panic("prev key should be before after pos:\n%s\n%llu:%llu\n",
- buf, iter->pos.inode, iter->pos.offset);
- }
-
- k = bch_btree_node_iter_peek_all(node_iter, b);
- if (k && !btree_iter_pos_cmp_packed(b, &iter->pos, k,
- iter->is_extents)) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
-
- bch_bkey_to_text(buf, sizeof(buf), &uk);
- panic("next key should be before iter pos:\n%llu:%llu\n%s\n",
- iter->pos.inode, iter->pos.offset, buf);
- }
-}
-
-void bch_btree_iter_verify(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
-
- if (iter->nodes[b->level] == b)
- __bch_btree_iter_verify(iter, b);
-
- for_each_linked_btree_node(iter, b, linked)
- __bch_btree_iter_verify(iter, b);
-}
-
-#endif
-
-static void __bch_btree_node_iter_fix(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- const struct bkey_packed *end = btree_bkey_last(b, t);
- struct btree_node_iter_set *set;
- unsigned offset = __btree_node_key_to_offset(b, where);
- int shift = new_u64s - clobber_u64s;
- unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
-
- btree_node_iter_for_each(node_iter, set)
- if (set->end == old_end)
- goto found;
-
- /* didn't find the bset in the iterator - might have to re-add it: */
- if (new_u64s &&
- btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->is_extents))
- bch_btree_node_iter_push(node_iter, b, where, end);
- return;
-found:
- set->end = (int) set->end + shift;
-
- /* Iterator hasn't gotten to the key that changed yet: */
- if (set->k < offset)
- return;
-
- if (new_u64s &&
- btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->is_extents)) {
- set->k = offset;
- bch_btree_node_iter_sort(node_iter, b);
- } else if (set->k < offset + clobber_u64s) {
- set->k = offset + new_u64s;
- if (set->k == set->end)
- *set = node_iter->data[--node_iter->used];
- bch_btree_node_iter_sort(node_iter, b);
- } else {
- set->k = (int) set->k + shift;
- }
-
- /*
- * Interior nodes are special because iterators for interior nodes don't
- * obey the usual invariants regarding the iterator position:
- *
- * We may have whiteouts that compare greater than the iterator
- * position, and logically should be in the iterator, but that we
- * skipped past to find the first live key greater than the iterator
- * position. This becomes an issue when we insert a new key that is
- * greater than the current iterator position, but smaller than the
- * whiteouts we've already skipped past - this happens in the course of
- * a btree split.
- *
- * We have to rewind the iterator back to before those whiteouts here,
- * else bch_btree_node_iter_prev() is not going to work and who knows what
- * else would happen. And we have to do it manually, because here we've
- * already done the insert and the iterator is currently inconsistent:
- *
- * We've got multiple competing invariants, here - we have to be careful
- * about rewinding iterators for interior nodes, because they should
- * always point to the key for the child node the btree iterator points
- * to.
- */
- if (b->level && new_u64s && !bkey_deleted(where) &&
- btree_iter_pos_cmp_packed(b, &iter->pos, where,
- iter->is_extents)) {
- struct bset_tree *t;
- struct bkey_packed *k;
-
- for_each_bset(b, t) {
- if (bch_bkey_to_bset(b, where) == t)
- continue;
-
- k = bkey_prev_all(b, t,
- bch_btree_node_iter_bset_pos(node_iter, b, t));
- if (k &&
- __btree_node_iter_cmp(node_iter, b,
- k, where) > 0) {
- struct btree_node_iter_set *set;
- unsigned offset =
- __btree_node_key_to_offset(b, bkey_next(k));
-
- btree_node_iter_for_each(node_iter, set)
- if (set->k == offset) {
- set->k = __btree_node_key_to_offset(b, k);
- bch_btree_node_iter_sort(node_iter, b);
- goto next_bset;
- }
-
- bch_btree_node_iter_push(node_iter, b, k,
- btree_bkey_last(b, t));
- }
-next_bset:
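- /* a label needs a following statement; this no-op provides one */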
- t = t;
- }
- }
-}
-
-void bch_btree_node_iter_fix(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
-{
- struct btree_iter *linked;
-
- if (node_iter != &iter->node_iters[b->level])
- __bch_btree_node_iter_fix(iter, b, node_iter, t,
- where, clobber_u64s, new_u64s);
-
- if (iter->nodes[b->level] == b)
- __bch_btree_node_iter_fix(iter, b,
- &iter->node_iters[b->level], t,
- where, clobber_u64s, new_u64s);
-
- for_each_linked_btree_node(iter, b, linked)
- __bch_btree_node_iter_fix(linked, b,
- &linked->node_iters[b->level], t,
- where, clobber_u64s, new_u64s);
-
- /* interior node iterators are... special... */
- if (!b->level)
- bch_btree_iter_verify(iter, b);
-}
-
-/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter)
-{
- struct btree *b = iter->nodes[iter->level];
- struct bkey_packed *k =
- bch_btree_node_iter_peek_all(&iter->node_iters[iter->level], b);
- struct bkey_s_c ret;
-
- EBUG_ON(!btree_node_locked(iter, iter->level));
-
- if (!k)
- return bkey_s_c_null;
-
- ret = bkey_disassemble(b, k, &iter->k);
-
- if (debug_check_bkeys(iter->c))
- bkey_debugcheck(iter->c, b, ret);
-
- return ret;
-}
-
-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter)
-{
- struct btree *b = iter->nodes[iter->level];
- struct bkey_packed *k =
- bch_btree_node_iter_peek(&iter->node_iters[iter->level], b);
- struct bkey_s_c ret;
-
- EBUG_ON(!btree_node_locked(iter, iter->level));
-
- if (!k)
- return bkey_s_c_null;
-
- ret = bkey_disassemble(b, k, &iter->k);
-
- if (debug_check_bkeys(iter->c))
- bkey_debugcheck(iter->c, b, ret);
-
- return ret;
-}
-
-static inline void __btree_iter_advance(struct btree_iter *iter)
-{
- bch_btree_node_iter_advance(&iter->node_iters[iter->level],
- iter->nodes[iter->level]);
-}
-
-/*
- * Verify that iterator for parent node points to child node:
- */
-static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
-{
- bool parent_locked;
- struct bkey_packed *k;
-
- if (!IS_ENABLED(CONFIG_BCACHE_DEBUG) ||
- !iter->nodes[b->level + 1])
- return;
-
- parent_locked = btree_node_locked(iter, b->level + 1);
-
- if (!btree_node_relock(iter, b->level + 1))
- return;
-
- k = bch_btree_node_iter_peek_all(&iter->node_iters[b->level + 1],
- iter->nodes[b->level + 1]);
- if (!k ||
- bkey_deleted(k) ||
- bkey_cmp_left_packed(iter->nodes[b->level + 1],
- k, &b->key.k.p)) {
- char buf[100];
- struct bkey uk = bkey_unpack_key(b, k);
-
- bch_bkey_to_text(buf, sizeof(buf), &uk);
- panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
- buf, b->key.k.p.inode, b->key.k.p.offset);
- }
-
- if (!parent_locked)
- btree_node_unlock(iter, b->level + 1);
-}
-
-static inline void __btree_iter_init(struct btree_iter *iter,
- struct btree *b)
-{
- bch_btree_node_iter_init(&iter->node_iters[b->level], b,
- iter->pos, iter->is_extents,
- btree_node_is_extents(b));
-
- /* Skip to first non whiteout: */
- if (b->level)
- bch_btree_node_iter_peek(&iter->node_iters[b->level], b);
-}
-
-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
- struct btree *b)
-{
- return iter->btree_id == b->btree_id &&
- bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
- btree_iter_pos_cmp(iter->pos, &b->key.k, iter->is_extents);
-}
-
-static inline void btree_iter_node_set(struct btree_iter *iter,
- struct btree *b)
-{
- btree_iter_verify_new_node(iter, b);
-
- EBUG_ON(!btree_iter_pos_in_node(iter, b));
- EBUG_ON(b->lock.state.seq & 1);
-
- iter->lock_seq[b->level] = b->lock.state.seq;
- iter->nodes[b->level] = b;
- __btree_iter_init(iter, b);
-}
-
-/*
- * A btree node is being replaced - update the iterator to point to the new
- * node:
- */
-bool bch_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
-
- for_each_linked_btree_iter(iter, linked)
- if (btree_iter_pos_in_node(linked, b)) {
- /*
- * bch_btree_iter_node_drop() has already been called -
- * the old node we're replacing has already been
- * unlocked and the pointer invalidated
- */
- BUG_ON(btree_node_locked(linked, b->level));
-
- /*
- * If @linked wants this node read locked, we don't want
- * to actually take the read lock now because it's not
- * legal to hold read locks on other nodes while we take
- * write locks, so the journal can make forward
- * progress...
- *
- * Instead, btree_iter_node_set() sets things up so
- * btree_node_relock() will succeed:
- */
-
- if (btree_want_intent(linked, b->level)) {
- six_lock_increment(&b->lock, SIX_LOCK_intent);
- mark_btree_node_intent_locked(linked, b->level);
- }
-
- btree_iter_node_set(linked, b);
- }
-
- if (!btree_iter_pos_in_node(iter, b)) {
- six_unlock_intent(&b->lock);
- return false;
- }
-
- mark_btree_node_intent_locked(iter, b->level);
- btree_iter_node_set(iter, b);
- return true;
-}
-
-void bch_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
- unsigned level = b->level;
-
- for_each_linked_btree_iter(iter, linked)
- if (linked->nodes[level] == b) {
- btree_node_unlock(linked, level);
- linked->nodes[level] = BTREE_ITER_NOT_END;
- }
-}
-
-void bch_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
-{
- unsigned level = b->level;
-
- if (iter->nodes[level] == b) {
- BUG_ON(b->lock.state.intent_lock != 1);
- btree_node_unlock(iter, level);
- iter->nodes[level] = BTREE_ITER_NOT_END;
- }
-}
-
-/*
- * A btree node has been modified in such a way as to invalidate iterators - fix
- * them:
- */
-void bch_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
-
- for_each_linked_btree_node(iter, b, linked)
- __btree_iter_init(linked, b);
- __btree_iter_init(iter, b);
-}
-
-static inline int btree_iter_lock_root(struct btree_iter *iter,
- unsigned depth_want)
-{
- struct bch_fs *c = iter->c;
- struct btree *b;
- enum six_lock_type lock_type;
- unsigned i;
-
- EBUG_ON(iter->nodes_locked);
-
- while (1) {
- b = READ_ONCE(c->btree_roots[iter->btree_id].b);
- iter->level = READ_ONCE(b->level);
-
- if (unlikely(iter->level < depth_want)) {
- /*
- * the root is at a lower depth than the depth we want:
- * got to the end of the btree, or we're walking nodes
- * greater than some depth and there are no nodes >=
- * that depth
- */
- iter->level = depth_want;
- iter->nodes[iter->level] = NULL;
- return 0;
- }
-
- lock_type = btree_lock_want(iter, iter->level);
- if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
- iter, lock_type)))
- return -EINTR;
-
- if (likely(b == c->btree_roots[iter->btree_id].b &&
- b->level == iter->level &&
- !race_fault())) {
- for (i = 0; i < iter->level; i++)
- iter->nodes[i] = BTREE_ITER_NOT_END;
- iter->nodes[iter->level] = b;
-
- mark_btree_node_locked(iter, iter->level, lock_type);
- btree_iter_node_set(iter, b);
- return 0;
-		}
-
- six_unlock_type(&b->lock, lock_type);
- }
-}
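
btree_iter_lock_root() above is an optimistic "read the root pointer, lock the
node, re-check that it is still the root, otherwise unlock and retry" loop. A
minimal pthread sketch of just that pattern - demo_* names are hypothetical,
not the libbcache locking API:

#include <pthread.h>
#include <stdatomic.h>

struct demo_node {
	pthread_mutex_t	lock;
	int		level;
};

struct demo_root {
	struct demo_node *_Atomic b;	/* swapped out when the root changes */
};

static struct demo_node *demo_lock_root(struct demo_root *r)
{
	while (1) {
		struct demo_node *b = atomic_load(&r->b);

		pthread_mutex_lock(&b->lock);

		/* still the root now that we hold its lock? then we're done */
		if (b == atomic_load(&r->b))
			return b;

		/* raced with a root switch: drop the lock and retry */
		pthread_mutex_unlock(&b->lock);
	}
}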
-
-static inline int btree_iter_down(struct btree_iter *iter)
-{
- struct btree *b;
- struct bkey_s_c k = __btree_iter_peek(iter);
- unsigned level = iter->level - 1;
- enum six_lock_type lock_type = btree_lock_want(iter, level);
- BKEY_PADDED(k) tmp;
-
- bkey_reassemble(&tmp.k, k);
-
- b = bch_btree_node_get(iter, &tmp.k, level, lock_type);
- if (unlikely(IS_ERR(b)))
- return PTR_ERR(b);
-
- iter->level = level;
- mark_btree_node_locked(iter, level, lock_type);
- btree_iter_node_set(iter, b);
- return 0;
-}
-
-static void btree_iter_up(struct btree_iter *iter)
-{
- btree_node_unlock(iter, iter->level++);
-}
-
-int __must_check __bch_btree_iter_traverse(struct btree_iter *);
-
-static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
-{
- struct bch_fs *c = iter->c;
- struct btree_iter *linked, *sorted_iters, **i;
-retry_all:
- bch_btree_iter_unlock(iter);
-
- if (ret != -ENOMEM && ret != -EINTR)
- goto io_error;
-
- if (ret == -ENOMEM) {
- struct closure cl;
-
- closure_init_stack(&cl);
-
- do {
- ret = mca_cannibalize_lock(c, &cl);
- closure_sync(&cl);
- } while (ret);
- }
-
- /*
- * Linked iters are normally a circular singly linked list - break cycle
- * while we sort them:
- */
- linked = iter->next;
- iter->next = NULL;
- sorted_iters = NULL;
-
- while (linked) {
- iter = linked;
- linked = linked->next;
-
- i = &sorted_iters;
- while (*i && btree_iter_cmp(iter, *i) > 0)
- i = &(*i)->next;
-
- iter->next = *i;
- *i = iter;
- }
-
- /* Make list circular again: */
- iter = sorted_iters;
- while (iter->next)
- iter = iter->next;
- iter->next = sorted_iters;
-
- /* Now, redo traversals in correct order: */
-
- iter = sorted_iters;
- do {
-retry:
- ret = __bch_btree_iter_traverse(iter);
- if (unlikely(ret)) {
- if (ret == -EINTR)
- goto retry;
- goto retry_all;
- }
-
- iter = iter->next;
- } while (iter != sorted_iters);
-
- ret = btree_iter_linked(iter) ? -EINTR : 0;
-out:
- mca_cannibalize_unlock(c);
- return ret;
-io_error:
- BUG_ON(ret != -EIO);
-
- iter->error = ret;
- iter->nodes[iter->level] = NULL;
- goto out;
-}
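
The list handling above breaks the circular singly linked list open,
insertion-sorts it through a pointer-to-pointer, then closes the cycle again.
A standalone sketch of that manipulation with integer positions - demo_* names
are hypothetical:

#include <stddef.h>

struct demo_iter {
	int			pos;
	struct demo_iter	*next;
};

void demo_sort_circular(struct demo_iter *iter)
{
	struct demo_iter *linked, *sorted = NULL, **i;

	/* break the cycle: */
	linked = iter->next;
	iter->next = NULL;

	/* insertion sort every element (including @iter) into @sorted: */
	while (linked) {
		iter = linked;
		linked = linked->next;

		i = &sorted;
		while (*i && iter->pos > (*i)->pos)
			i = &(*i)->next;

		iter->next = *i;
		*i = iter;
	}

	/* make the list circular again: */
	iter = sorted;
	while (iter->next)
		iter = iter->next;
	iter->next = sorted;
}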
-
-/*
- * This is the main state machine for walking down the btree - walks down to a
- * specified depth
- *
- * Returns 0 on success, -EIO on error (error reading in a btree node).
- *
- * On error, caller (peek_node()/peek_key()) must return NULL; the error is
- * stashed in the iterator and returned from bch_btree_iter_unlock().
- */
-int __must_check __bch_btree_iter_traverse(struct btree_iter *iter)
-{
- unsigned depth_want = iter->level;
-
- /* make sure we have all the intent locks we need - ugh */
- if (unlikely(iter->nodes[iter->level] &&
- iter->level + 1 < iter->locks_want)) {
- unsigned i;
-
- for (i = iter->level + 1;
- i < iter->locks_want && iter->nodes[i];
- i++)
- if (!btree_node_relock(iter, i)) {
- while (iter->nodes[iter->level] &&
- iter->level + 1 < iter->locks_want)
- btree_iter_up(iter);
- break;
- }
- }
-
- /*
- * If the current node isn't locked, go up until we have a locked node
- * or run out of nodes:
- */
- while (iter->nodes[iter->level] &&
- !(is_btree_node(iter, iter->level) &&
- btree_node_relock(iter, iter->level) &&
- btree_iter_pos_cmp(iter->pos,
- &iter->nodes[iter->level]->key.k,
- iter->is_extents)))
- btree_iter_up(iter);
-
- /*
- * If we've got a btree node locked (i.e. we aren't about to relock the
- * root) - advance its node iterator if necessary:
- */
- if (iter->nodes[iter->level]) {
- struct bkey_s_c k;
-
- while ((k = __btree_iter_peek_all(iter)).k &&
- !btree_iter_pos_cmp(iter->pos, k.k, iter->is_extents))
- __btree_iter_advance(iter);
- }
-
- /*
- * Note: iter->nodes[iter->level] may be temporarily NULL here - that
- * would indicate to other code that we got to the end of the btree,
- * here it indicates that relocking the root failed - it's critical that
- * btree_iter_lock_root() comes next and that it can't fail
- */
- while (iter->level > depth_want) {
- int ret = iter->nodes[iter->level]
- ? btree_iter_down(iter)
- : btree_iter_lock_root(iter, depth_want);
- if (unlikely(ret)) {
- iter->level = depth_want;
- return ret;
- }
- }
-
- return 0;
-}
-
-int __must_check bch_btree_iter_traverse(struct btree_iter *iter)
-{
- int ret;
-
- if (unlikely(!iter->nodes[iter->level]))
- return 0;
-
- iter->at_end_of_leaf = false;
-
- ret = __bch_btree_iter_traverse(iter);
- if (unlikely(ret))
- ret = btree_iter_traverse_error(iter, ret);
-
- return ret;
-}
-
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch_btree_iter_peek_node(struct btree_iter *iter)
-{
- struct btree *b;
- int ret;
-
- EBUG_ON(iter->is_extents);
-
- ret = bch_btree_iter_traverse(iter);
- if (ret)
- return NULL;
-
- b = iter->nodes[iter->level];
-
- if (b) {
- EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
- iter->pos = b->key.k.p;
- }
-
- return b;
-}
-
-struct btree *bch_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
-{
- struct btree *b;
- int ret;
-
- EBUG_ON(iter->is_extents);
-
- btree_iter_up(iter);
-
- if (!iter->nodes[iter->level])
- return NULL;
-
- /* parent node usually won't be locked: redo traversal if necessary */
- ret = bch_btree_iter_traverse(iter);
- if (ret)
- return NULL;
-
- b = iter->nodes[iter->level];
- if (!b)
- return b;
-
- if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
- /* Haven't gotten to the end of the parent node: */
-
- /* ick: */
- iter->pos = iter->btree_id == BTREE_ID_INODES
- ? btree_type_successor(iter->btree_id, iter->pos)
- : bkey_successor(iter->pos);
- iter->level = depth;
-
- ret = bch_btree_iter_traverse(iter);
- if (ret)
- return NULL;
-
- b = iter->nodes[iter->level];
- }
-
- iter->pos = b->key.k.p;
-
- return b;
-}
-
-/* Iterate across keys (in leaf nodes only) */
-
-void bch_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
-{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- struct bkey_packed *k;
-
- EBUG_ON(iter->level != 0);
- EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
- EBUG_ON(!btree_node_locked(iter, 0));
- EBUG_ON(bkey_cmp(new_pos, b->key.k.p) > 0);
-
- while ((k = bch_btree_node_iter_peek_all(node_iter, b)) &&
- !btree_iter_pos_cmp_packed(b, &new_pos, k,
- iter->is_extents))
- bch_btree_node_iter_advance(node_iter, b);
-
- if (!k &&
- !btree_iter_pos_cmp(new_pos, &b->key.k, iter->is_extents))
- iter->at_end_of_leaf = true;
-
- iter->pos = new_pos;
-}
-
-void bch_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
-{
- EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); /* XXX handle this */
- iter->pos = new_pos;
-}
-
-void bch_btree_iter_advance_pos(struct btree_iter *iter)
-{
- /*
- * We use iter->k instead of iter->pos for extents: iter->pos will be
- * equal to the start of the extent we returned, but we need to advance
- * to the end of the extent we returned.
- */
- bch_btree_iter_set_pos(iter,
- btree_type_successor(iter->btree_id, iter->k.p));
-}
-
-/* XXX: expensive */
-void bch_btree_iter_rewind(struct btree_iter *iter, struct bpos pos)
-{
- /* incapable of rewinding across nodes: */
- BUG_ON(bkey_cmp(pos, iter->nodes[iter->level]->data->min_key) < 0);
-
- iter->pos = pos;
- __btree_iter_init(iter, iter->nodes[iter->level]);
-}
-
-struct bkey_s_c bch_btree_iter_peek(struct btree_iter *iter)
-{
- struct bkey_s_c k;
- int ret;
-
- while (1) {
- ret = bch_btree_iter_traverse(iter);
- if (unlikely(ret)) {
- iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
- return bkey_s_c_err(ret);
- }
-
- k = __btree_iter_peek(iter);
- if (likely(k.k)) {
- /*
- * iter->pos should always be equal to the key we just
- * returned - except extents can straddle iter->pos:
- */
- if (!iter->is_extents ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- bch_btree_iter_set_pos(iter, bkey_start_pos(k.k));
- return k;
- }
-
- iter->pos = iter->nodes[0]->key.k.p;
-
- if (!bkey_cmp(iter->pos, POS_MAX)) {
- iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
- bch_btree_iter_unlock(iter);
- return bkey_s_c_null;
- }
-
- iter->pos = btree_type_successor(iter->btree_id, iter->pos);
- }
-}
-
-struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *iter)
-{
- struct bkey_s_c k;
- struct bkey n;
- int ret;
-
- while (1) {
- ret = bch_btree_iter_traverse(iter);
- if (unlikely(ret)) {
- iter->k = KEY(iter->pos.inode, iter->pos.offset, 0);
- return bkey_s_c_err(ret);
- }
-
- k = __btree_iter_peek_all(iter);
-recheck:
- if (!k.k || bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) {
- /* hole */
- bkey_init(&n);
- n.p = iter->pos;
-
- if (iter->is_extents) {
- if (n.p.offset == KEY_OFFSET_MAX) {
- iter->pos = bkey_successor(iter->pos);
- goto recheck;
- }
-
- if (!k.k)
- k.k = &iter->nodes[0]->key.k;
-
- bch_key_resize(&n,
- min_t(u64, KEY_SIZE_MAX,
- (k.k->p.inode == n.p.inode
- ? bkey_start_offset(k.k)
- : KEY_OFFSET_MAX) -
- n.p.offset));
-
- EBUG_ON(!n.size);
- }
-
- iter->k = n;
- return (struct bkey_s_c) { &iter->k, NULL };
- } else if (!bkey_deleted(k.k)) {
- return k;
- } else {
- __btree_iter_advance(iter);
- }
- }
-}
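
The hole synthesized above is sized "up to the start of the next key if that
key is in the same inode, otherwise up to the end of the inode's offset space,
capped at the maximum key size". A standalone sketch of just that arithmetic -
demo_* names and the limit values are stand-ins, not libbcache constants:

#include <stdint.h>

#define DEMO_KEY_SIZE_MAX	((uint64_t) UINT32_MAX)	/* stand-in limit */
#define DEMO_KEY_OFFSET_MAX	UINT64_MAX		/* stand-in limit */

static inline uint64_t demo_min(uint64_t a, uint64_t b) { return a < b ? a : b; }

static inline uint32_t demo_hole_size(uint64_t hole_inode, uint64_t hole_offset,
				      uint64_t next_inode, uint64_t next_start)
{
	/* end of the hole: the next key's start, or the end of this inode */
	uint64_t end = hole_inode == next_inode ? next_start : DEMO_KEY_OFFSET_MAX;

	return (uint32_t) demo_min(DEMO_KEY_SIZE_MAX, end - hole_offset);
}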
-
-void __bch_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned depth)
-{
- iter->level = depth;
- /* bch_bkey_ops isn't used much, this would be a cache miss */
- /* iter->is_extents = bch_bkey_ops[btree_id]->is_extents; */
- iter->is_extents = btree_id == BTREE_ID_EXTENTS;
- iter->nodes_locked = 0;
- iter->nodes_intent_locked = 0;
- iter->locks_want = min(locks_want, BTREE_MAX_DEPTH);
- iter->btree_id = btree_id;
- iter->at_end_of_leaf = 0;
- iter->error = 0;
- iter->c = c;
- iter->pos = pos;
- memset(iter->nodes, 0, sizeof(iter->nodes));
- iter->nodes[iter->level] = BTREE_ITER_NOT_END;
- iter->next = iter;
-
- prefetch(c->btree_roots[btree_id].b);
-}
-
-void bch_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
-{
- BUG_ON(btree_iter_linked(new));
-
- new->next = iter->next;
- iter->next = new;
-
- if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
- unsigned nr_iters = 1;
-
- for_each_linked_btree_iter(iter, new)
- nr_iters++;
-
- BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
- }
-}
-
-void bch_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
-{
- bch_btree_iter_unlock(dst);
- memcpy(dst, src, offsetof(struct btree_iter, next));
- dst->nodes_locked = dst->nodes_intent_locked = 0;
-}
diff --git a/libbcache/btree_iter.h b/libbcache/btree_iter.h
deleted file mode 100644
index acca2c68..00000000
--- a/libbcache/btree_iter.h
+++ /dev/null
@@ -1,282 +0,0 @@
-#ifndef _BCACHE_BTREE_ITER_H
-#define _BCACHE_BTREE_ITER_H
-
-#include "btree_types.h"
-
-struct btree_iter {
- /* Current btree depth */
- u8 level;
-
- /*
- * Used in bch_btree_iter_traverse(), to indicate whether we're
- * searching for @pos or the first key strictly greater than @pos
- */
- u8 is_extents;
-
- /* Bitmasks for read/intent locks held per level */
- u8 nodes_locked;
- u8 nodes_intent_locked;
-
- /* Btree level below which we start taking intent locks */
- u8 locks_want;
-
- enum btree_id btree_id:8;
-
- /*
- * indicates we need to call bch_btree_iter_traverse() to revalidate
- * iterator:
- */
- u8 at_end_of_leaf;
-
- s8 error;
-
- struct bch_fs *c;
-
- /* Current position of the iterator */
- struct bpos pos;
-
- u32 lock_seq[BTREE_MAX_DEPTH];
-
- /*
- * NOTE: Never set iter->nodes to NULL except in btree_iter_lock_root().
- *
- * This is because iter->nodes[iter->level] == NULL is how
- * btree_iter_next_node() knows that it's finished with a depth first
- * traversal. Just unlocking a node (with btree_node_unlock()) is fine,
- * and if you really don't want that node used again (e.g. btree_split()
- * freed it) decrementing lock_seq will cause btree_node_relock() to
- * always fail (but since freeing a btree node takes a write lock on the
- * node, which increments the node's lock seq, that's not actually
- * necessary in that example).
- *
- * One extra slot for a sentinel NULL:
- */
- struct btree *nodes[BTREE_MAX_DEPTH + 1];
- struct btree_node_iter node_iters[BTREE_MAX_DEPTH];
-
- /*
- * Current unpacked key - so that bch_btree_iter_next()/
- * bch_btree_iter_next_with_holes() can correctly advance pos.
- */
- struct bkey k;
-
- /*
- * Circular linked list of linked iterators: linked iterators share
- * locks (e.g. two linked iterators may have the same node intent
- * locked, or read and write locked, at the same time), and insertions
- * through one iterator won't invalidate the other linked iterators.
- */
-
- /* Must come last: */
- struct btree_iter *next;
-};
-
-static inline bool btree_iter_linked(const struct btree_iter *iter)
-{
- return iter->next != iter;
-}
-
-/**
- * for_each_linked_btree_iter - iterate over all iterators linked with @_iter
- */
-#define for_each_linked_btree_iter(_iter, _linked) \
- for ((_linked) = (_iter)->next; \
- (_linked) != (_iter); \
- (_linked) = (_linked)->next)
-
-static inline struct btree_iter *
-__next_linked_btree_node(struct btree_iter *iter, struct btree *b,
- struct btree_iter *linked)
-{
- do {
- linked = linked->next;
-
- if (linked == iter)
- return NULL;
-
- /*
- * We don't compare the low bits of the lock sequence numbers
- * because @iter might have taken a write lock on @b, and we
- * don't want to skip the linked iterator if the sequence
- * numbers were equal before taking that write lock. The lock
- * sequence number is incremented by taking and releasing write
- * locks and is even when unlocked:
- */
- } while (linked->nodes[b->level] != b ||
- linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1);
-
- return linked;
-}
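
The shifted comparison above works because a six lock's sequence number is
bumped on both write lock and write unlock: it is even while unlocked, odd
while write locked, and seq >> 1 only changes across a full lock/unlock cycle.
A tiny standalone illustration (plain integers, not the six lock code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t node_seq = 4;		/* even: node not write locked */
	uint32_t iter_seq = node_seq;	/* what a linked iterator saved */

	node_seq++;			/* take write lock: seq goes odd */
	assert(iter_seq != node_seq);		/* exact compare mismatches */
	assert(iter_seq >> 1 == node_seq >> 1);	/* shifted compare still matches */

	node_seq++;			/* release write lock: even again */
	assert(iter_seq >> 1 != node_seq >> 1);	/* contents may have changed */
	return 0;
}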
-
-/**
- * for_each_linked_btree_node - iterate over all iterators linked with @_iter
- * that also point to @_b
- *
- * @_b is assumed to be locked by @_iter
- *
- * Filters out iterators that don't have a valid btree_node iterator for @_b -
- * i.e. iterators for which btree_node_relock() would not succeed.
- */
-#define for_each_linked_btree_node(_iter, _b, _linked) \
- for ((_linked) = (_iter); \
- ((_linked) = __next_linked_btree_node(_iter, _b, _linked));)
-
-#ifdef CONFIG_BCACHE_DEBUG
-void bch_btree_iter_verify(struct btree_iter *, struct btree *);
-#else
-static inline void bch_btree_iter_verify(struct btree_iter *iter,
- struct btree *b) {}
-#endif
-
-void bch_btree_node_iter_fix(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bset_tree *,
- struct bkey_packed *, unsigned, unsigned);
-
-int bch_btree_iter_unlock(struct btree_iter *);
-bool __bch_btree_iter_set_locks_want(struct btree_iter *, unsigned);
-
-static inline bool bch_btree_iter_set_locks_want(struct btree_iter *iter,
- unsigned new_locks_want)
-{
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
-
- if (iter->locks_want == new_locks_want &&
- iter->nodes_intent_locked == (1 << new_locks_want) - 1)
- return true;
-
- return __bch_btree_iter_set_locks_want(iter, new_locks_want);
-}
-
-bool bch_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch_btree_iter_node_drop_linked(struct btree_iter *, struct btree *);
-void bch_btree_iter_node_drop(struct btree_iter *, struct btree *);
-
-void bch_btree_iter_reinit_node(struct btree_iter *, struct btree *);
-
-int __must_check bch_btree_iter_traverse(struct btree_iter *);
-
-struct btree *bch_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch_btree_iter_next_node(struct btree_iter *, unsigned);
-
-struct bkey_s_c bch_btree_iter_peek(struct btree_iter *);
-struct bkey_s_c bch_btree_iter_peek_with_holes(struct btree_iter *);
-void bch_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
-void bch_btree_iter_set_pos(struct btree_iter *, struct bpos);
-void bch_btree_iter_advance_pos(struct btree_iter *);
-void bch_btree_iter_rewind(struct btree_iter *, struct bpos);
-
-void __bch_btree_iter_init(struct btree_iter *, struct bch_fs *,
-			   enum btree_id, struct bpos, unsigned, unsigned);
-
-static inline void bch_btree_iter_init(struct btree_iter *iter,
- struct bch_fs *c,
- enum btree_id btree_id,
- struct bpos pos)
-{
- __bch_btree_iter_init(iter, c, btree_id, pos, 0, 0);
-}
-
-static inline void bch_btree_iter_init_intent(struct btree_iter *iter,
- struct bch_fs *c,
- enum btree_id btree_id,
- struct bpos pos)
-{
- __bch_btree_iter_init(iter, c, btree_id, pos, 1, 0);
-}
-
-void bch_btree_iter_link(struct btree_iter *, struct btree_iter *);
-void bch_btree_iter_copy(struct btree_iter *, struct btree_iter *);
-
-static inline struct bpos btree_type_successor(enum btree_id id,
- struct bpos pos)
-{
- if (id == BTREE_ID_INODES) {
- pos.inode++;
- pos.offset = 0;
- } else if (id != BTREE_ID_EXTENTS) {
- pos = bkey_successor(pos);
- }
-
- return pos;
-}
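
btree_type_successor() above has three cases: inode btree positions jump to the
next inode, extent positions are returned unchanged (advancing past an extent
already lands on its end position), and everything else steps to the next bpos
with a carry from offset into inode. A standalone sketch with hypothetical
demo_* names and simplified overflow handling:

#include <stdint.h>

enum demo_btree_id { DEMO_ID_EXTENTS, DEMO_ID_INODES, DEMO_ID_OTHER };

struct demo_pos { uint64_t inode, offset; };

static inline struct demo_pos demo_pos_successor(struct demo_pos p)
{
	if (++p.offset == 0)	/* offset wrapped: carry into inode */
		p.inode++;
	return p;
}

static inline struct demo_pos demo_type_successor(enum demo_btree_id id,
						  struct demo_pos p)
{
	if (id == DEMO_ID_INODES) {
		p.inode++;
		p.offset = 0;
	} else if (id != DEMO_ID_EXTENTS) {
		p = demo_pos_successor(p);
	}
	return p;
}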
-
-static inline int __btree_iter_cmp(enum btree_id id,
- struct bpos pos,
- const struct btree_iter *r)
-{
- if (id != r->btree_id)
- return id < r->btree_id ? -1 : 1;
- return bkey_cmp(pos, r->pos);
-}
-
-static inline int btree_iter_cmp(const struct btree_iter *l,
- const struct btree_iter *r)
-{
- return __btree_iter_cmp(l->btree_id, l->pos, r);
-}
-
-#define __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, \
- _b, _locks_want) \
- for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
- _start, _locks_want, _depth), \
- (_iter)->is_extents = false, \
- _b = bch_btree_iter_peek_node(_iter); \
- (_b); \
- (_b) = bch_btree_iter_next_node(_iter, _depth))
-
-#define for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b) \
- __for_each_btree_node(_iter, _c, _btree_id, _start, _depth, _b, 0)
-
-#define __for_each_btree_key(_iter, _c, _btree_id, _start, \
- _k, _locks_want) \
- for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
- _start, _locks_want, 0); \
- !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek(_iter)).k); \
- bch_btree_iter_advance_pos(_iter))
-
-#define for_each_btree_key(_iter, _c, _btree_id, _start, _k) \
- __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 0)
-
-#define for_each_btree_key_intent(_iter, _c, _btree_id, _start, _k) \
- __for_each_btree_key(_iter, _c, _btree_id, _start, _k, 1)
-
-#define __for_each_btree_key_with_holes(_iter, _c, _btree_id, \
- _start, _k, _locks_want) \
- for (__bch_btree_iter_init((_iter), (_c), (_btree_id), \
- _start, _locks_want, 0); \
- !IS_ERR_OR_NULL(((_k) = bch_btree_iter_peek_with_holes(_iter)).k);\
- bch_btree_iter_advance_pos(_iter))
-
-#define for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k) \
- __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 0)
-
-#define for_each_btree_key_with_holes_intent(_iter, _c, _btree_id, \
- _start, _k) \
- __for_each_btree_key_with_holes(_iter, _c, _btree_id, _start, _k, 1)
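
All of the for_each_btree_key*() macros above share the same "init the
iterator; loop while peek returns a key; advance" shape. A self-contained
miniature of that macro pattern - mini_* names are hypothetical and an array
stands in for the btree:

#include <stddef.h>
#include <stdio.h>

struct mini_iter { const int *keys; size_t nr, idx; };

static void mini_iter_init(struct mini_iter *it, const int *keys, size_t nr)
{
	it->keys = keys;
	it->nr	 = nr;
	it->idx	 = 0;
}

static const int *mini_iter_peek(struct mini_iter *it)
{
	return it->idx < it->nr ? &it->keys[it->idx] : NULL;
}

#define for_each_mini_key(_it, _keys, _nr, _k)				\
	for (mini_iter_init((_it), (_keys), (_nr));			\
	     ((_k) = mini_iter_peek(_it));				\
	     (_it)->idx++)

int main(void)
{
	const int keys[] = { 10, 20, 30 };
	struct mini_iter it;
	const int *k;

	for_each_mini_key(&it, keys, 3, k)
		printf("%d\n", *k);
	return 0;
}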
-
-static inline int btree_iter_err(struct bkey_s_c k)
-{
- return IS_ERR(k.k) ? PTR_ERR(k.k) : 0;
-}
-
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline void bch_btree_iter_cond_resched(struct btree_iter *iter)
-{
- struct btree_iter *linked;
-
- if (need_resched()) {
- for_each_linked_btree_iter(iter, linked)
- bch_btree_iter_unlock(linked);
- bch_btree_iter_unlock(iter);
- schedule();
- } else if (race_fault()) {
- for_each_linked_btree_iter(iter, linked)
- bch_btree_iter_unlock(linked);
- bch_btree_iter_unlock(iter);
- }
-}
-
-#endif /* _BCACHE_BTREE_ITER_H */
diff --git a/libbcache/btree_locking.h b/libbcache/btree_locking.h
deleted file mode 100644
index 76f85c0d..00000000
--- a/libbcache/btree_locking.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef _BCACHE_BTREE_LOCKING_H
-#define _BCACHE_BTREE_LOCKING_H
-
-/*
- * Only for internal btree use:
- *
- * The btree iterator tracks what locks it wants to take, and what locks it
- * currently has - here we have wrappers for locking/unlocking btree nodes and
- * updating the iterator state
- */
-
-#include "btree_iter.h"
-#include "six.h"
-
-/* matches six lock types */
-enum btree_node_locked_type {
- BTREE_NODE_UNLOCKED = -1,
- BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
- BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
-};
-
-static inline int btree_node_locked_type(struct btree_iter *iter,
- unsigned level)
-{
- /*
- * We're relying on the fact that if nodes_intent_locked is set
- * nodes_locked must be set as well, so that we can compute without
- * branches:
- */
- return BTREE_NODE_UNLOCKED +
- ((iter->nodes_locked >> level) & 1) +
- ((iter->nodes_intent_locked >> level) & 1);
-}
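
The branch-free decode above works because the intent bit for a level is only
ever set together with its locked bit, so UNLOCKED (-1) plus the two extracted
bits lands on -1, 0 (read) or 1 (intent). A standalone check of that
arithmetic - demo_* names are hypothetical:

#include <assert.h>

enum demo_lock_type { DEMO_UNLOCKED = -1, DEMO_READ = 0, DEMO_INTENT = 1 };

static int demo_locked_type(unsigned locked, unsigned intent_locked,
			    unsigned level)
{
	return DEMO_UNLOCKED +
		((locked >> level) & 1) +
		((intent_locked >> level) & 1);
}

int main(void)
{
	unsigned locked = 0, intent = 0;

	assert(demo_locked_type(locked, intent, 0) == DEMO_UNLOCKED);

	locked |= 1 << 0;			/* read lock on level 0 */
	assert(demo_locked_type(locked, intent, 0) == DEMO_READ);

	locked |= 1 << 2;			/* intent lock on level 2 */
	intent |= 1 << 2;
	assert(demo_locked_type(locked, intent, 2) == DEMO_INTENT);
	assert(demo_locked_type(locked, intent, 1) == DEMO_UNLOCKED);
	return 0;
}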
-
-static inline bool btree_node_intent_locked(struct btree_iter *iter,
- unsigned level)
-{
- return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
-}
-
-static inline bool btree_node_read_locked(struct btree_iter *iter,
- unsigned level)
-{
- return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
-}
-
-static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
-{
- return iter->nodes_locked & (1 << level);
-}
-
-static inline void mark_btree_node_unlocked(struct btree_iter *iter,
- unsigned level)
-{
- iter->nodes_locked &= ~(1 << level);
- iter->nodes_intent_locked &= ~(1 << level);
-}
-
-static inline void mark_btree_node_locked(struct btree_iter *iter,
- unsigned level,
- enum six_lock_type type)
-{
- /* relying on this to avoid a branch */
- BUILD_BUG_ON(SIX_LOCK_read != 0);
- BUILD_BUG_ON(SIX_LOCK_intent != 1);
-
- iter->nodes_locked |= 1 << level;
- iter->nodes_intent_locked |= type << level;
-}
-
-static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
- unsigned level)
-{
- mark_btree_node_locked(iter, level, SIX_LOCK_intent);
-}
-
-static inline enum six_lock_type
-btree_lock_want(struct btree_iter *iter, int level)
-{
- return level < iter->locks_want
- ? SIX_LOCK_intent
- : SIX_LOCK_read;
-}
-
-static inline bool btree_want_intent(struct btree_iter *iter, int level)
-{
- return btree_lock_want(iter, level) == SIX_LOCK_intent;
-}
-
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
-{
- int lock_type = btree_node_locked_type(iter, level);
-
- if (lock_type != BTREE_NODE_UNLOCKED)
- six_unlock_type(&iter->nodes[level]->lock, lock_type);
- mark_btree_node_unlocked(iter, level);
-}
-
-bool __bch_btree_node_lock(struct btree *, struct bpos, unsigned,
- struct btree_iter *, enum six_lock_type);
-
-static inline bool btree_node_lock(struct btree *b, struct bpos pos,
- unsigned level,
- struct btree_iter *iter,
- enum six_lock_type type)
-{
- return likely(six_trylock_type(&b->lock, type)) ||
- __bch_btree_node_lock(b, pos, level, iter, type);
-}
-
-bool btree_node_relock(struct btree_iter *, unsigned);
-
-void btree_node_unlock_write(struct btree *, struct btree_iter *);
-void btree_node_lock_write(struct btree *, struct btree_iter *);
-
-void __btree_node_unlock_write(struct btree *, struct btree_iter *);
-void __btree_node_lock_write(struct btree *, struct btree_iter *);
-
-#endif /* _BCACHE_BTREE_LOCKING_H */
diff --git a/libbcache/btree_types.h b/libbcache/btree_types.h
deleted file mode 100644
index cfca12ea..00000000
--- a/libbcache/btree_types.h
+++ /dev/null
@@ -1,311 +0,0 @@
-#ifndef _BCACHE_BTREE_TYPES_H
-#define _BCACHE_BTREE_TYPES_H
-
-#include <linux/bcache.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/rhashtable.h>
-#include <linux/semaphore.h>
-#include <linux/workqueue.h>
-
-#include "bkey_methods.h"
-#include "journal_types.h"
-#include "six.h"
-
-struct open_bucket;
-struct btree_interior_update;
-
-#define MAX_BSETS 3U
-
-struct btree_nr_keys {
-
- /*
- * Amount of live metadata (i.e. size of node after a compaction) in
- * units of u64s
- */
- u16 live_u64s;
- u16 bset_u64s[MAX_BSETS];
-
- /* live keys only: */
- u16 packed_keys;
- u16 unpacked_keys;
-};
-
-struct bset_tree {
- /*
- * We construct a binary tree in an array as if the array
- * started at 1, so that things line up on the same cachelines
- * better: see comments in bset.c at cacheline_to_bkey() for
- * details
- */
-
- /* size of the binary tree and prev array */
- u16 size;
-
- /* function of size - precalculated for to_inorder() */
- u16 extra;
-
- u16 data_offset;
- u16 aux_data_offset;
- u16 end_offset;
-
- struct bpos max_key;
-};
-
-struct btree_write {
- struct journal_entry_pin journal;
- struct closure_waitlist wait;
-};
-
-struct btree {
- /* Hottest entries first */
- struct rhash_head hash;
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-
- struct six_lock lock;
-
- unsigned long flags;
- u16 written;
- u8 level;
- u8 btree_id;
- u8 nsets;
- u8 nr_key_bits;
-
- struct bkey_format format;
-
- struct btree_node *data;
- void *aux_data;
-
- /*
- * Sets of sorted keys - the real btree node - plus a binary search tree
- *
- * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
- * to the memory we have allocated for this btree node. Additionally,
- * set[0]->data points to the entire btree node as it exists on disk.
- */
- struct bset_tree set[MAX_BSETS];
-
- struct btree_nr_keys nr;
- u16 sib_u64s[2];
- u16 whiteout_u64s;
- u16 uncompacted_whiteout_u64s;
- u8 page_order;
- u8 unpack_fn_len;
-
- /*
- * XXX: add a delete sequence number, so when btree_node_relock() fails
- * because the lock sequence number has changed - i.e. the contents were
- * modified - we can still relock the node if it's still the one we
- * want, without redoing the traversal
- */
-
- /*
- * For asynchronous splits/interior node updates:
- * When we do a split, we allocate new child nodes and update the parent
- * node to point to them: we update the parent in memory immediately,
- * but then we must wait until the children have been written out before
- * the update to the parent can be written - this is a list of the
- * btree_interior_updates that are blocking this node from being
- * written:
- */
- struct list_head write_blocked;
-
- struct open_bucket *ob;
-
- /* lru list */
- struct list_head list;
-
- struct btree_write writes[2];
-
-#ifdef CONFIG_BCACHE_DEBUG
- bool *expensive_debug_checks;
-#endif
-};
-
-#define BTREE_FLAG(flag) \
-static inline bool btree_node_ ## flag(struct btree *b) \
-{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void set_btree_node_ ## flag(struct btree *b) \
-{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
- \
-static inline void clear_btree_node_ ## flag(struct btree *b) \
-{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-
-enum btree_flags {
- BTREE_NODE_read_error,
- BTREE_NODE_write_error,
- BTREE_NODE_dirty,
- BTREE_NODE_noevict,
- BTREE_NODE_write_idx,
- BTREE_NODE_accessed,
- BTREE_NODE_write_in_flight,
- BTREE_NODE_just_written,
-};
-
-BTREE_FLAG(read_error);
-BTREE_FLAG(write_error);
-BTREE_FLAG(dirty);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(just_written);
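
BTREE_FLAG() above stamps out a test/set/clear accessor trio per flag on top of
the kernel's atomic bitops. A standalone analogue using plain, non-atomic bit
operations - DEMO_* names are hypothetical:

#include <stdbool.h>

struct demo_node { unsigned long flags; };

enum demo_node_flags { DEMO_dirty, DEMO_accessed };

#define DEMO_FLAG(flag)							\
static inline bool demo_node_ ## flag(struct demo_node *b)		\
{ return b->flags & (1UL << DEMO_ ## flag); }				\
									\
static inline void set_demo_node_ ## flag(struct demo_node *b)		\
{ b->flags |= 1UL << DEMO_ ## flag; }					\
									\
static inline void clear_demo_node_ ## flag(struct demo_node *b)	\
{ b->flags &= ~(1UL << DEMO_ ## flag); }

DEMO_FLAG(dirty)
DEMO_FLAG(accessed)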
-
-static inline struct btree_write *btree_current_write(struct btree *b)
-{
- return b->writes + btree_node_write_idx(b);
-}
-
-static inline struct btree_write *btree_prev_write(struct btree *b)
-{
- return b->writes + (btree_node_write_idx(b) ^ 1);
-}
-
-static inline struct bset_tree *bset_tree_last(struct btree *b)
-{
- EBUG_ON(!b->nsets);
- return b->set + b->nsets - 1;
-}
-
-static inline struct bset *bset(const struct btree *b,
- const struct bset_tree *t)
-{
- return (void *) b->data + t->data_offset * sizeof(u64);
-}
-
-static inline struct bset *btree_bset_first(struct btree *b)
-{
- return bset(b, b->set);
-}
-
-static inline struct bset *btree_bset_last(struct btree *b)
-{
- return bset(b, bset_tree_last(b));
-}
-
-static inline u16
-__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
-{
- size_t ret = (u64 *) k - (u64 *) b->data - 1;
-
- EBUG_ON(ret > U16_MAX);
- return ret;
-}
-
-static inline struct bkey_packed *
-__btree_node_offset_to_key(const struct btree *b, u16 k)
-{
- return (void *) ((u64 *) b->data + k + 1);
-}
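
The two helpers above convert between a key pointer and a biased u64 index
into the node, small enough to store in a u16. A standalone round trip showing
the two are inverses - demo_* names are hypothetical and a plain u64 buffer
stands in for the node:

#include <assert.h>
#include <stdint.h>

struct demo_node { uint64_t data[128]; };

static uint16_t demo_key_to_offset(const struct demo_node *b, const uint64_t *k)
{
	return (uint16_t) (k - b->data - 1);
}

static uint64_t *demo_offset_to_key(struct demo_node *b, uint16_t off)
{
	return b->data + off + 1;
}

int main(void)
{
	struct demo_node n;
	uint64_t *k = &n.data[17];

	assert(demo_offset_to_key(&n, demo_key_to_offset(&n, k)) == k);
	return 0;
}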
-
-#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
-
-#define btree_bkey_last(_b, _t) \
-({ \
- EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
- vstruct_last(bset(_b, _t))); \
- \
- __btree_node_offset_to_key(_b, (_t)->end_offset); \
-})
-
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
-{
- t->end_offset =
- __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
- btree_bkey_last(b, t);
-}
-
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
- const struct bset *i)
-{
- t->data_offset = (u64 *) i - (u64 *) b->data;
-
- EBUG_ON(bset(b, t) != i);
-
- set_btree_bset_end(b, t);
-}
-
-static inline unsigned bset_byte_offset(struct btree *b, void *i)
-{
- return i - (void *) b->data;
-}
-
-/* Type of keys @b contains: */
-static inline enum bkey_type btree_node_type(struct btree *b)
-{
- return b->level ? BKEY_TYPE_BTREE : b->btree_id;
-}
-
-static inline const struct bkey_ops *btree_node_ops(struct btree *b)
-{
- return bch_bkey_ops[btree_node_type(b)];
-}
-
-static inline bool btree_node_has_ptrs(struct btree *b)
-{
- return btree_type_has_ptrs(btree_node_type(b));
-}
-
-static inline bool btree_node_is_extents(struct btree *b)
-{
- return btree_node_type(b) == BKEY_TYPE_EXTENTS;
-}
-
-struct btree_root {
- struct btree *b;
-
- struct btree_interior_update *as;
-
- /* On disk root - see async splits: */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
- u8 level;
- u8 alive;
-};
-
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
-struct btree_iter;
-struct btree_node_iter;
-
-enum extent_insert_hook_ret {
- BTREE_HOOK_DO_INSERT,
- BTREE_HOOK_NO_INSERT,
- BTREE_HOOK_RESTART_TRANS,
-};
-
-struct extent_insert_hook {
- enum extent_insert_hook_ret
- (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
- struct bkey_s_c, const struct bkey_i *);
-};
-
-enum btree_insert_ret {
- BTREE_INSERT_OK,
- /* extent spanned multiple leaf nodes: have to traverse to next node: */
- BTREE_INSERT_NEED_TRAVERSE,
- /* write lock held for too long */
- BTREE_INSERT_NEED_RESCHED,
- /* leaf node needs to be split */
- BTREE_INSERT_BTREE_NODE_FULL,
- BTREE_INSERT_JOURNAL_RES_FULL,
- BTREE_INSERT_ENOSPC,
- BTREE_INSERT_NEED_GC_LOCK,
-};
-
-enum btree_gc_coalesce_fail_reason {
- BTREE_GC_COALESCE_FAIL_RESERVE_GET,
- BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
- BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
-};
-
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
- struct btree *,
- struct btree_node_iter *);
-
-#endif /* _BCACHE_BTREE_TYPES_H */
diff --git a/libbcache/btree_update.c b/libbcache/btree_update.c
deleted file mode 100644
index 751a51c2..00000000
--- a/libbcache/btree_update.c
+++ /dev/null
@@ -1,2345 +0,0 @@
-
-#include "bcache.h"
-#include "alloc.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_locking.h"
-#include "buckets.h"
-#include "extents.h"
-#include "journal.h"
-#include "keylist.h"
-#include "super-io.h"
-
-#include <linux/random.h>
-#include <linux/sort.h>
-#include <trace/events/bcache.h>
-
-static void btree_interior_update_updated_root(struct bch_fs *,
- struct btree_interior_update *,
- enum btree_id);
-
-/* Calculate ideal packed bkey format for new btree nodes: */
-
-void __bch_btree_calc_format(struct bkey_format_state *s, struct btree *b)
-{
- struct bkey_packed *k;
- struct bset_tree *t;
- struct bkey uk;
-
- bch_bkey_format_add_pos(s, b->data->min_key);
-
- for_each_bset(b, t)
- for (k = btree_bkey_first(b, t);
- k != btree_bkey_last(b, t);
- k = bkey_next(k))
- if (!bkey_whiteout(k)) {
- uk = bkey_unpack_key(b, k);
- bch_bkey_format_add_key(s, &uk);
- }
-}
-
-static struct bkey_format bch_btree_calc_format(struct btree *b)
-{
- struct bkey_format_state s;
-
- bch_bkey_format_init(&s);
- __bch_btree_calc_format(&s, b);
-
- return bch_bkey_format_done(&s);
-}
-
-static size_t btree_node_u64s_with_format(struct btree *b,
- struct bkey_format *new_f)
-{
- struct bkey_format *old_f = &b->format;
-
- /* stupid integer promotion rules */
- ssize_t delta =
- (((int) new_f->key_u64s - old_f->key_u64s) *
- (int) b->nr.packed_keys) +
- (((int) new_f->key_u64s - BKEY_U64s) *
- (int) b->nr.unpacked_keys);
-
- BUG_ON(delta + b->nr.live_u64s < 0);
-
- return b->nr.live_u64s + delta;
-}
-
-/**
- * btree_node_format_fits - check if we could rewrite node with a new format
- *
- * This assumes all keys can pack with the new format -- it just checks if
- * the re-packed keys would fit inside the node itself.
- */
-bool bch_btree_node_format_fits(struct bch_fs *c, struct btree *b,
- struct bkey_format *new_f)
-{
- size_t u64s = btree_node_u64s_with_format(b, new_f);
-
- return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
-}
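
The estimate above says: each currently-packed key changes size by the change
in key_u64s, and each currently-unpacked key is assumed to pack under the new
format, changing from BKEY_U64s to the new key_u64s; the casts keep the delta
signed despite integer promotion. A standalone sketch with made-up numbers -
demo_* names and DEMO_BKEY_U64S are hypothetical:

#include <assert.h>
#include <stddef.h>

#define DEMO_BKEY_U64S	5	/* made-up size of an unpacked key, in u64s */

struct demo_nr { size_t live_u64s, packed_keys, unpacked_keys; };

static size_t demo_u64s_with_format(const struct demo_nr *nr,
				    unsigned old_key_u64s,
				    unsigned new_key_u64s)
{
	/* cast before multiplying so the delta can go negative: */
	ptrdiff_t delta =
		((ptrdiff_t) new_key_u64s - old_key_u64s) * (ptrdiff_t) nr->packed_keys +
		((ptrdiff_t) new_key_u64s - DEMO_BKEY_U64S) * (ptrdiff_t) nr->unpacked_keys;

	return nr->live_u64s + delta;
}

int main(void)
{
	struct demo_nr nr = { .live_u64s = 1000, .packed_keys = 80, .unpacked_keys = 20 };

	/* growing packed keys from 3 to 4 u64s: +80, shrinking unpacked keys: -20 */
	assert(demo_u64s_with_format(&nr, 3, 4) == 1060);
	return 0;
}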
-
-/* Btree node freeing/allocation: */
-
-/*
- * We're doing the index update that makes @b unreachable, update stuff to
- * reflect that:
- *
- * Must be called _before_ btree_interior_update_updated_root() or
- * btree_interior_update_updated_btree:
- */
-static void bch_btree_node_free_index(struct bch_fs *c, struct btree *b,
- enum btree_id id, struct bkey_s_c k,
- struct bch_fs_usage *stats)
-{
- struct btree_interior_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
-
- for_each_pending_btree_node_free(c, as, d)
- if (!bkey_cmp(k.k->p, d->key.k.p) &&
- bkey_val_bytes(k.k) == bkey_val_bytes(&d->key.k) &&
- !memcmp(k.v, &d->key.v, bkey_val_bytes(k.k)))
- goto found;
-
- BUG();
-found:
- d->index_update_done = true;
-
- /*
- * Btree nodes are accounted as freed in bch_alloc_stats when they're
- * freed from the index:
- */
- stats->s[S_COMPRESSED][S_META] -= c->sb.btree_node_size;
- stats->s[S_UNCOMPRESSED][S_META] -= c->sb.btree_node_size;
-
- /*
- * We're dropping @k from the btree, but it's still live until the
- * index update is persistent so we need to keep a reference around for
- * mark and sweep to find - that's primarily what the
- * btree_node_pending_free list is for.
- *
- * So here (when we set index_update_done = true), we're moving an
- * existing reference to a different part of the larger "gc keyspace" -
- * and the new position comes after the old position, since GC marks
- * the pending free list after it walks the btree.
- *
- * If we move the reference while mark and sweep is _between_ the old
- * and the new position, mark and sweep will see the reference twice
- * and it'll get double accounted - so check for that here and subtract
- * to cancel out one of mark and sweep's markings if necessary:
- */
-
- /*
- * bch_mark_key() compares the current gc pos to the pos we're
- * moving this reference from, hence one comparison here:
- */
- if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
- struct bch_fs_usage tmp = { 0 };
-
- bch_mark_key(c, bkey_i_to_s_c(&d->key),
- -c->sb.btree_node_size, true, b
- ? gc_pos_btree_node(b)
- : gc_pos_btree_root(id),
- &tmp, 0);
- /*
- * Don't apply tmp - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void __btree_node_free(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
-{
- trace_bcache_btree_node_free(c, b);
-
- BUG_ON(b == btree_node_root(c, b));
- BUG_ON(b->ob);
- BUG_ON(!list_empty(&b->write_blocked));
-
- six_lock_write(&b->lock);
-
- if (btree_node_dirty(b))
- bch_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(b);
-
- mca_hash_remove(c, b);
-
- mutex_lock(&c->btree_cache_lock);
- list_move(&b->list, &c->btree_cache_freeable);
- mutex_unlock(&c->btree_cache_lock);
-
- /*
- * By using six_unlock_write() directly instead of
- * btree_node_unlock_write(), we don't update the iterator's sequence
- * numbers and cause future btree_node_relock() calls to fail:
- */
- six_unlock_write(&b->lock);
-}
-
-void bch_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
-{
- struct open_bucket *ob = b->ob;
-
- b->ob = NULL;
-
- __btree_node_free(c, b, NULL);
-
- bch_open_bucket_put(c, ob);
-}
-
-void bch_btree_node_free_inmem(struct btree_iter *iter, struct btree *b)
-{
- bch_btree_iter_node_drop_linked(iter, b);
-
- __btree_node_free(iter->c, b, iter);
-
- bch_btree_iter_node_drop(iter, b);
-}
-
-static void bch_btree_node_free_ondisk(struct bch_fs *c,
- struct pending_btree_node_free *pending)
-{
- struct bch_fs_usage stats = { 0 };
-
- BUG_ON(!pending->index_update_done);
-
- bch_mark_key(c, bkey_i_to_s_c(&pending->key),
- -c->sb.btree_node_size, true,
- gc_phase(GC_PHASE_PENDING_DELETE),
- &stats, 0);
- /*
- * Don't apply stats - pending deletes aren't tracked in
- * bch_alloc_stats:
- */
-}
-
-void btree_open_bucket_put(struct bch_fs *c, struct btree *b)
-{
- bch_open_bucket_put(c, b->ob);
- b->ob = NULL;
-}
-
-static struct btree *__bch_btree_node_alloc(struct bch_fs *c,
- bool use_reserve,
- struct disk_reservation *res,
- struct closure *cl)
-{
- BKEY_PADDED(k) tmp;
- struct open_bucket *ob;
- struct btree *b;
- unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
-
- mutex_lock(&c->btree_reserve_cache_lock);
- if (c->btree_reserve_cache_nr > reserve) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- ob = a->ob;
- bkey_copy(&tmp.k, &a->k);
- mutex_unlock(&c->btree_reserve_cache_lock);
- goto mem_alloc;
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
-retry:
- /* alloc_sectors is weird, I suppose */
- bkey_extent_init(&tmp.k);
-	tmp.k.k.size = c->sb.btree_node_size;
-
- ob = bch_alloc_sectors(c, &c->btree_write_point,
- bkey_i_to_extent(&tmp.k),
- res->nr_replicas,
- c->opts.metadata_replicas_required,
- use_reserve ? RESERVE_BTREE : RESERVE_NONE,
- cl);
- if (IS_ERR(ob))
- return ERR_CAST(ob);
-
- if (tmp.k.k.size < c->sb.btree_node_size) {
- bch_open_bucket_put(c, ob);
- goto retry;
- }
-mem_alloc:
- b = mca_alloc(c);
-
- /* we hold cannibalize_lock: */
- BUG_ON(IS_ERR(b));
- BUG_ON(b->ob);
-
- bkey_copy(&b->key, &tmp.k);
- b->key.k.size = 0;
- b->ob = ob;
-
- return b;
-}
-
-static struct btree *bch_btree_node_alloc(struct bch_fs *c,
- unsigned level, enum btree_id id,
- struct btree_reserve *reserve)
-{
- struct btree *b;
-
- BUG_ON(!reserve->nr);
-
- b = reserve->b[--reserve->nr];
-
- BUG_ON(mca_hash_insert(c, b, level, id));
-
- set_btree_node_accessed(b);
- set_btree_node_dirty(b);
-
- bch_bset_init_first(b, &b->data->keys);
- memset(&b->nr, 0, sizeof(b->nr));
- b->data->magic = cpu_to_le64(bset_magic(c));
- b->data->flags = 0;
- SET_BTREE_NODE_ID(b->data, id);
- SET_BTREE_NODE_LEVEL(b->data, level);
- b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
-
- bch_btree_build_aux_trees(b);
-
- bch_check_mark_super(c, &b->key, true);
-
- trace_bcache_btree_node_alloc(c, b);
- return b;
-}
-
-struct btree *__btree_node_alloc_replacement(struct bch_fs *c,
- struct btree *b,
- struct bkey_format format,
- struct btree_reserve *reserve)
-{
- struct btree *n;
-
- n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve);
-
- n->data->min_key = b->data->min_key;
- n->data->max_key = b->data->max_key;
- n->data->format = format;
-
- btree_node_set_format(n, format);
-
- bch_btree_sort_into(c, n, b);
-
- btree_node_reset_sib_u64s(n);
-
- n->key.k.p = b->key.k.p;
- trace_bcache_btree_node_alloc_replacement(c, b, n);
-
- return n;
-}
-
-struct btree *btree_node_alloc_replacement(struct bch_fs *c,
- struct btree *b,
- struct btree_reserve *reserve)
-{
- struct bkey_format new_f = bch_btree_calc_format(b);
-
- /*
- * The keys might expand with the new format - if they wouldn't fit in
- * the btree node anymore, use the old format for now:
- */
- if (!bch_btree_node_format_fits(c, b, &new_f))
- new_f = b->format;
-
- return __btree_node_alloc_replacement(c, b, new_f, reserve);
-}
-
-static void bch_btree_set_root_inmem(struct bch_fs *c, struct btree *b,
- struct btree_reserve *btree_reserve)
-{
- struct btree *old = btree_node_root(c, b);
-
- /* Root nodes cannot be reaped */
- mutex_lock(&c->btree_cache_lock);
- list_del_init(&b->list);
- mutex_unlock(&c->btree_cache_lock);
-
- mutex_lock(&c->btree_root_lock);
- btree_node_root(c, b) = b;
- mutex_unlock(&c->btree_root_lock);
-
- if (btree_reserve) {
- /*
- * New allocation (we're not being called because we're in
- * bch_btree_root_read()) - do marking while holding
- * btree_root_lock:
- */
- struct bch_fs_usage stats = { 0 };
-
- bch_mark_key(c, bkey_i_to_s_c(&b->key),
- c->sb.btree_node_size, true,
- gc_pos_btree_root(b->btree_id),
- &stats, 0);
-
- if (old)
- bch_btree_node_free_index(c, NULL, old->btree_id,
- bkey_i_to_s_c(&old->key),
- &stats);
- bch_fs_usage_apply(c, &stats, &btree_reserve->disk_res,
- gc_pos_btree_root(b->btree_id));
- }
-
- bch_recalc_btree_reserve(c);
-}
-
-static void bch_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
-{
- struct btree_root *r = &c->btree_roots[b->btree_id];
-
- mutex_lock(&c->btree_root_lock);
-
- BUG_ON(b != r->b);
- bkey_copy(&r->key, &b->key);
- r->level = b->level;
- r->alive = true;
-
- mutex_unlock(&c->btree_root_lock);
-}
-
-/*
- * Only for filesystem bringup, when first reading the btree roots or allocating
- * btree roots when initializing a new filesystem:
- */
-void bch_btree_set_root_initial(struct bch_fs *c, struct btree *b,
- struct btree_reserve *btree_reserve)
-{
- BUG_ON(btree_node_root(c, b));
-
- bch_btree_set_root_inmem(c, b, btree_reserve);
- bch_btree_set_root_ondisk(c, b);
-}
-
-/**
- * bch_btree_set_root - update the root in memory and on disk
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks. However, you must hold an intent lock on the
- * old root.
- *
- * Note: This allocates a journal entry but doesn't add any keys to
- * it. All the btree roots are part of every journal write, so there
- * is nothing new to be done. This just guarantees that there is a
- * journal write.
- */
-static void bch_btree_set_root(struct btree_iter *iter, struct btree *b,
- struct btree_interior_update *as,
- struct btree_reserve *btree_reserve)
-{
- struct bch_fs *c = iter->c;
- struct btree *old;
-
- trace_bcache_btree_set_root(c, b);
- BUG_ON(!b->written);
-
- old = btree_node_root(c, b);
-
- /*
- * Ensure no one is using the old root while we switch to the
- * new root:
- */
- btree_node_lock_write(old, iter);
-
- bch_btree_set_root_inmem(c, b, btree_reserve);
-
- btree_interior_update_updated_root(c, as, iter->btree_id);
-
- /*
- * Unlock old root after new root is visible:
- *
- * The new root isn't persistent, but that's ok: we still have
- * an intent lock on the new root, and any updates that would
- * depend on the new root would have to update the new root.
- */
- btree_node_unlock_write(old, iter);
-}
-
-static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level,
- enum btree_id id,
- struct btree_reserve *reserve)
-{
- struct btree *b = bch_btree_node_alloc(c, level, id, reserve);
-
- b->data->min_key = POS_MIN;
- b->data->max_key = POS_MAX;
- b->data->format = bch_btree_calc_format(b);
- b->key.k.p = POS_MAX;
-
- btree_node_set_format(b, b->data->format);
- bch_btree_build_aux_trees(b);
-
- six_unlock_write(&b->lock);
-
- return b;
-}
-
-void bch_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
-{
- bch_disk_reservation_put(c, &reserve->disk_res);
-
- mutex_lock(&c->btree_reserve_cache_lock);
-
- while (reserve->nr) {
- struct btree *b = reserve->b[--reserve->nr];
-
- six_unlock_write(&b->lock);
-
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
-
- a->ob = b->ob;
- b->ob = NULL;
- bkey_copy(&a->k, &b->key);
- } else {
- bch_open_bucket_put(c, b->ob);
- b->ob = NULL;
- }
-
- __btree_node_free(c, b, NULL);
-
- six_unlock_intent(&b->lock);
- }
-
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- mempool_free(reserve, &c->btree_reserve_pool);
-}
-
-static struct btree_reserve *__bch_btree_reserve_get(struct bch_fs *c,
- unsigned nr_nodes,
- unsigned flags,
- struct closure *cl)
-{
- struct btree_reserve *reserve;
- struct btree *b;
- struct disk_reservation disk_res = { 0, 0 };
- unsigned sectors = nr_nodes * c->sb.btree_node_size;
- int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD|
- BCH_DISK_RESERVATION_METADATA;
-
- if (flags & BTREE_INSERT_NOFAIL)
- disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
-
- /*
- * This check isn't necessary for correctness - it's just to potentially
- * prevent us from doing a lot of work that'll end up being wasted:
- */
- ret = bch_journal_error(&c->journal);
- if (ret)
- return ERR_PTR(ret);
-
- if (bch_disk_reservation_get(c, &disk_res, sectors, disk_res_flags))
- return ERR_PTR(-ENOSPC);
-
- BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
-
- /*
-	 * Protects both reaping from the btree node cache and use of the
-	 * btree node open bucket reserve:
- */
- ret = mca_cannibalize_lock(c, cl);
- if (ret) {
- bch_disk_reservation_put(c, &disk_res);
- return ERR_PTR(ret);
- }
-
- reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
-
- reserve->disk_res = disk_res;
- reserve->nr = 0;
-
- while (reserve->nr < nr_nodes) {
- b = __bch_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
- &disk_res, cl);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err_free;
- }
-
- reserve->b[reserve->nr++] = b;
- }
-
- mca_cannibalize_unlock(c);
- return reserve;
-err_free:
- bch_btree_reserve_put(c, reserve);
- mca_cannibalize_unlock(c);
- trace_bcache_btree_reserve_get_fail(c, nr_nodes, cl);
- return ERR_PTR(ret);
-}
-
-struct btree_reserve *bch_btree_reserve_get(struct bch_fs *c,
- struct btree *b,
- unsigned extra_nodes,
- unsigned flags,
- struct closure *cl)
-{
- unsigned depth = btree_node_root(c, b)->level - b->level;
- unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes;
-
- return __bch_btree_reserve_get(c, nr_nodes, flags, cl);
-}
-
-int bch_btree_root_alloc(struct bch_fs *c, enum btree_id id,
- struct closure *writes)
-{
- struct closure cl;
- struct btree_reserve *reserve;
- struct btree *b;
-
- closure_init_stack(&cl);
-
- while (1) {
- /* XXX haven't calculated capacity yet :/ */
- reserve = __bch_btree_reserve_get(c, 1, 0, &cl);
- if (!IS_ERR(reserve))
- break;
-
- if (PTR_ERR(reserve) == -ENOSPC)
- return PTR_ERR(reserve);
-
- closure_sync(&cl);
- }
-
- b = __btree_root_alloc(c, 0, id, reserve);
-
- bch_btree_node_write(c, b, writes, SIX_LOCK_intent, -1);
-
- bch_btree_set_root_initial(c, b, reserve);
- btree_open_bucket_put(c, b);
- six_unlock_intent(&b->lock);
-
- bch_btree_reserve_put(c, reserve);
-
- return 0;
-}
-
-static void bch_insert_fixup_btree_ptr(struct btree_iter *iter,
- struct btree *b,
- struct bkey_i *insert,
- struct btree_node_iter *node_iter,
- struct disk_reservation *disk_res)
-{
- struct bch_fs *c = iter->c;
- struct bch_fs_usage stats = { 0 };
- struct bkey_packed *k;
- struct bkey tmp;
-
- if (bkey_extent_is_data(&insert->k))
- bch_mark_key(c, bkey_i_to_s_c(insert),
- c->sb.btree_node_size, true,
- gc_pos_btree_node(b), &stats, 0);
-
- while ((k = bch_btree_node_iter_peek_all(node_iter, b)) &&
- !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
- bch_btree_node_iter_advance(node_iter, b);
-
- /*
- * If we're overwriting, look up pending delete and mark so that gc
- * marks it on the pending delete list:
- */
- if (k && !bkey_cmp_packed(b, k, &insert->k))
- bch_btree_node_free_index(c, b, iter->btree_id,
- bkey_disassemble(b, k, &tmp),
- &stats);
-
- bch_fs_usage_apply(c, &stats, disk_res, gc_pos_btree_node(b));
-
- bch_btree_bset_insert_key(iter, b, node_iter, insert);
- set_btree_node_dirty(b);
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non-extents: */
-bool bch_btree_bset_insert_key(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- const struct bkey_format *f = &b->format;
- struct bkey_packed *k;
- struct bset_tree *t;
- unsigned clobber_u64s;
-
- EBUG_ON(btree_node_just_written(b));
- EBUG_ON(bset_written(b, btree_bset_last(b)));
- EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
- bkey_cmp(insert->k.p, b->data->max_key) > 0);
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
-
- k = bch_btree_node_iter_peek_all(node_iter, b);
- if (k && !bkey_cmp_packed(b, k, &insert->k)) {
- BUG_ON(bkey_whiteout(k));
-
- t = bch_bkey_to_bset(b, k);
-
- if (bset_unwritten(b, bset(b, t)) &&
- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
- BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
-
- k->type = insert->k.type;
- memcpy_u64s(bkeyp_val(f, k), &insert->v,
- bkey_val_u64s(&insert->k));
- return true;
- }
-
- insert->k.needs_whiteout = k->needs_whiteout;
-
- btree_keys_account_key_drop(&b->nr, t - b->set, k);
-
- if (t == bset_tree_last(b)) {
- clobber_u64s = k->u64s;
-
- /*
- * If we're deleting, and the key we're deleting doesn't
- * need a whiteout (it wasn't overwriting a key that had
- * been written to disk) - just delete it:
- */
- if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
- bch_bset_delete(b, k, clobber_u64s);
- bch_btree_node_iter_fix(iter, b, node_iter, t,
- k, clobber_u64s, 0);
- return true;
- }
-
- goto overwrite;
- }
-
- k->type = KEY_TYPE_DELETED;
- bch_btree_node_iter_fix(iter, b, node_iter, t, k,
- k->u64s, k->u64s);
-
- if (bkey_whiteout(&insert->k)) {
- reserve_whiteout(b, t, k);
- return true;
- } else {
- k->needs_whiteout = false;
- }
- } else {
- /*
- * Deleting, but the key to delete wasn't found - nothing to do:
- */
- if (bkey_whiteout(&insert->k))
- return false;
-
- insert->k.needs_whiteout = false;
- }
-
- t = bset_tree_last(b);
- k = bch_btree_node_iter_bset_pos(node_iter, b, t);
- clobber_u64s = 0;
-overwrite:
- bch_bset_insert(b, node_iter, k, insert, clobber_u64s);
- if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
- bch_btree_node_iter_fix(iter, b, node_iter, t, k,
- clobber_u64s, k->u64s);
- return true;
-}
-
-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
- unsigned i)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_write *w = container_of(pin, struct btree_write, journal);
- struct btree *b = container_of(w, struct btree, writes[i]);
-
- six_lock_read(&b->lock);
- /*
- * Reusing a btree node can race with the journal reclaim code calling
- * the journal pin flush fn, and there's no good fix for this: we don't
- * really want journal_pin_drop() to block until the flush fn is no
- * longer running, because journal_pin_drop() is called from the btree
- * node write endio function, and we can't wait on the flush fn to
- * finish running in mca_reap() - where we make reused btree nodes ready
- * to use again - because there, we're holding the lock this function
- * needs - deadlock.
- *
- * So, the b->level check is a hack so we don't try to write nodes we
- * shouldn't:
- */
- if (!b->level)
- bch_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
- six_unlock_read(&b->lock);
-}
-
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin)
-{
- return __btree_node_flush(j, pin, 0);
-}
-
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin)
-{
- return __btree_node_flush(j, pin, 1);
-}
-
-void bch_btree_journal_key(struct btree_insert *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct btree *b = iter->nodes[0];
- struct btree_write *w = btree_current_write(b);
-
- EBUG_ON(iter->level || b->level);
- EBUG_ON(!trans->journal_res.ref &&
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
-
- if (!journal_pin_active(&w->journal))
- bch_journal_pin_add(j, &w->journal,
- btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
-
- if (trans->journal_res.ref) {
- u64 seq = trans->journal_res.seq;
- bool needs_whiteout = insert->k.needs_whiteout;
-
- /*
-		 * We have a bug where we're seeing an extent with an invalid
-		 * crc entry in the journal - trying to track it down:
- */
- BUG_ON(bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert)));
-
- /* ick */
- insert->k.needs_whiteout = false;
- bch_journal_add_keys(j, &trans->journal_res,
- b->btree_id, insert);
- insert->k.needs_whiteout = needs_whiteout;
-
- if (trans->journal_seq)
- *trans->journal_seq = seq;
- btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
- }
-
- if (!btree_node_dirty(b))
- set_btree_node_dirty(b);
-}
-
-static enum btree_insert_ret
-bch_insert_fixup_key(struct btree_insert *trans,
- struct btree_insert_entry *insert)
-{
- struct btree_iter *iter = insert->iter;
-
- BUG_ON(iter->level);
-
- if (bch_btree_bset_insert_key(iter,
- iter->nodes[0],
- &iter->node_iters[0],
- insert->k))
- bch_btree_journal_key(trans, iter, insert->k);
-
- trans->did_work = true;
- return BTREE_INSERT_OK;
-}
-
-static void verify_keys_sorted(struct keylist *l)
-{
-#ifdef CONFIG_BCACHE_DEBUG
- struct bkey_i *k;
-
- for_each_keylist_key(l, k)
- BUG_ON(bkey_next(k) != l->top &&
- bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
-#endif
-}
-
-static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter)
-{
- struct bch_fs *c = iter->c;
-
- btree_node_lock_write(b, iter);
-
- if (btree_node_just_written(b) &&
- bch_btree_post_write_cleanup(c, b))
- bch_btree_iter_reinit_node(iter, b);
-
- /*
- * If the last bset has been written, or if it's gotten too big - start
- * a new bset to insert into:
- */
- if (want_new_bset(c, b))
- bch_btree_init_next(c, b, iter);
-}
-
-/* Asynchronous interior node update machinery */
-
-struct btree_interior_update *
-bch_btree_interior_update_alloc(struct bch_fs *c)
-{
- struct btree_interior_update *as;
-
- as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
- memset(as, 0, sizeof(*as));
- closure_init(&as->cl, &c->cl);
- as->c = c;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
-
- bch_keylist_init(&as->parent_keys, as->inline_keys,
- ARRAY_SIZE(as->inline_keys));
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add(&as->list, &c->btree_interior_update_list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- return as;
-}
-
-static void btree_interior_update_free(struct closure *cl)
-{
- struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
-
- mempool_free(as, &as->c->btree_interior_update_pool);
-}
-
-static void btree_interior_update_nodes_reachable(struct closure *cl)
-{
- struct btree_interior_update *as =
- container_of(cl, struct btree_interior_update, cl);
- struct bch_fs *c = as->c;
- unsigned i;
-
- bch_journal_pin_drop(&c->journal, &as->journal);
-
- mutex_lock(&c->btree_interior_update_lock);
-
- for (i = 0; i < as->nr_pending; i++)
- bch_btree_node_free_ondisk(c, &as->pending[i]);
- as->nr_pending = 0;
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- mutex_lock(&c->btree_interior_update_lock);
- list_del(&as->list);
- mutex_unlock(&c->btree_interior_update_lock);
-
- closure_wake_up(&as->wait);
-
- closure_return_with_destructor(cl, btree_interior_update_free);
-}
-
-static void btree_interior_update_nodes_written(struct closure *cl)
-{
- struct btree_interior_update *as =
- container_of(cl, struct btree_interior_update, cl);
- struct bch_fs *c = as->c;
- struct btree *b;
-
- if (bch_journal_error(&c->journal)) {
- /* XXX what? */
- }
-
- /* XXX: missing error handling, damnit */
-
- /* check for journal error, bail out if we flushed */
-
- /*
- * We did an update to a parent node where the pointers we added pointed
- * to child nodes that weren't written yet: now, the child nodes have
- * been written so we can write out the update to the interior node.
- */
-retry:
- mutex_lock(&c->btree_interior_update_lock);
- switch (as->mode) {
- case BTREE_INTERIOR_NO_UPDATE:
- BUG();
- case BTREE_INTERIOR_UPDATING_NODE:
- /* The usual case: */
- b = READ_ONCE(as->b);
-
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- six_lock_read(&b->lock);
- six_unlock_read(&b->lock);
- goto retry;
- }
-
- BUG_ON(!btree_node_dirty(b));
- closure_wait(&btree_current_write(b)->wait, cl);
-
- list_del(&as->write_blocked_list);
-
- if (list_empty(&b->write_blocked))
- bch_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
- six_unlock_read(&b->lock);
- break;
-
- case BTREE_INTERIOR_UPDATING_AS:
- /*
- * The btree node we originally updated has been freed and is
-		 * being rewritten - so we don't need to write anything here; we
-		 * just need to signal to that btree_interior_update that it's ok
-		 * to make the new replacement node visible:
- */
- closure_put(&as->parent_as->cl);
-
- /*
- * and then we have to wait on that btree_interior_update to finish:
- */
- closure_wait(&as->parent_as->wait, cl);
- break;
-
- case BTREE_INTERIOR_UPDATING_ROOT:
- /* b is the new btree root: */
- b = READ_ONCE(as->b);
-
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- six_lock_read(&b->lock);
- six_unlock_read(&b->lock);
- goto retry;
- }
-
- BUG_ON(c->btree_roots[b->btree_id].as != as);
- c->btree_roots[b->btree_id].as = NULL;
-
- bch_btree_set_root_ondisk(c, b);
-
- /*
-		 * We don't have to wait on anything here (before
- * btree_interior_update_nodes_reachable frees the old nodes
- * ondisk) - we've ensured that the very next journal write will
- * have the pointer to the new root, and before the allocator
- * can reuse the old nodes it'll have to do a journal commit:
- */
- six_unlock_read(&b->lock);
- }
- mutex_unlock(&c->btree_interior_update_lock);
-
- continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
-}
-
-/*
- * We're updating @b with pointers to nodes that haven't finished writing yet:
- * block @b from being written until @as completes
- */
-static void btree_interior_update_updated_btree(struct bch_fs *c,
- struct btree_interior_update *as,
- struct btree *b)
-{
- mutex_lock(&c->btree_interior_update_lock);
-
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
- BUG_ON(!btree_node_dirty(b));
-
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
- as->b = b;
- list_add(&as->write_blocked_list, &b->write_blocked);
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-
- continue_at(&as->cl, btree_interior_update_nodes_written,
- system_freezable_wq);
-}
-
-static void btree_interior_update_updated_root(struct bch_fs *c,
- struct btree_interior_update *as,
- enum btree_id btree_id)
-{
- struct btree_root *r = &c->btree_roots[btree_id];
-
- mutex_lock(&c->btree_interior_update_lock);
-
- BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
-
- /*
- * Old root might not be persistent yet - if so, redirect its
- * btree_interior_update operation to point to us:
- */
- if (r->as) {
- BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
-
- r->as->b = NULL;
- r->as->mode = BTREE_INTERIOR_UPDATING_AS;
- r->as->parent_as = as;
- closure_get(&as->cl);
- }
-
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
- as->b = r->b;
- r->as = as;
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- bch_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-
- continue_at(&as->cl, btree_interior_update_nodes_written,
- system_freezable_wq);
-}
-
-static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin)
-{
- struct btree_interior_update *as =
- container_of(pin, struct btree_interior_update, journal);
-
- bch_journal_flush_seq_async(j, as->journal_seq, NULL);
-}
-
-/*
- * @b is being split/rewritten: it may have pointers to not-yet-written btree
- * nodes and thus outstanding btree_interior_updates - redirect @b's
- * btree_interior_updates to point to this btree_interior_update:
- */
-void bch_btree_interior_update_will_free_node(struct bch_fs *c,
- struct btree_interior_update *as,
- struct btree *b)
-{
- struct btree_interior_update *p, *n;
- struct pending_btree_node_free *d;
- struct bset_tree *t;
-
- /*
- * Does this node have data that hasn't been written in the journal?
- *
- * If so, we have to wait for the corresponding journal entry to be
- * written before making the new nodes reachable - we can't just carry
- * over the bset->journal_seq tracking, since we'll be mixing those keys
- * in with keys that aren't in the journal anymore:
- */
- for_each_bset(b, t)
- as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
-
- /*
- * Does this node have unwritten data that has a pin on the journal?
- *
- * If so, transfer that pin to the btree_interior_update operation -
- * note that if we're freeing multiple nodes, we only need to keep the
- * oldest pin of any of the nodes we're freeing. We'll release the pin
- * when the new nodes are persistent and reachable on disk:
- */
- bch_journal_pin_add_if_older(&c->journal,
- &b->writes[0].journal,
- &as->journal, interior_update_flush);
- bch_journal_pin_add_if_older(&c->journal,
- &b->writes[1].journal,
- &as->journal, interior_update_flush);
-
- mutex_lock(&c->btree_interior_update_lock);
-
- /*
- * Does this node have any btree_interior_update operations preventing
- * it from being written?
- *
- * If so, redirect them to point to this btree_interior_update: we can
- * write out our new nodes, but we won't make them visible until those
- * operations complete
- */
- list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
- BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
-
- p->mode = BTREE_INTERIOR_UPDATING_AS;
- list_del(&p->write_blocked_list);
- p->b = NULL;
- p->parent_as = as;
- closure_get(&as->cl);
- }
-
- /* Add this node to the list of nodes being freed: */
- BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
-
- d = &as->pending[as->nr_pending++];
- d->index_update_done = false;
- d->seq = b->data->keys.seq;
- d->btree_id = b->btree_id;
- d->level = b->level;
- bkey_copy(&d->key, &b->key);
-
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
-static void btree_node_interior_verify(struct btree *b)
-{
- struct btree_node_iter iter;
- struct bkey_packed *k;
-
- BUG_ON(!b->level);
-
- bch_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
-#if 1
- BUG_ON(!(k = bch_btree_node_iter_peek(&iter, b)) ||
- bkey_cmp_left_packed(b, k, &b->key.k.p));
-
- BUG_ON((bch_btree_node_iter_advance(&iter, b),
- !bch_btree_node_iter_end(&iter)));
-#else
- const char *msg;
-
- msg = "not found";
- k = bch_btree_node_iter_peek(&iter, b);
- if (!k)
- goto err;
-
- msg = "isn't what it should be";
- if (bkey_cmp_left_packed(b, k, &b->key.k.p))
- goto err;
-
- bch_btree_node_iter_advance(&iter, b);
-
- msg = "isn't last key";
- if (!bch_btree_node_iter_end(&iter))
- goto err;
- return;
-err:
- bch_dump_btree_node(b);
- printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
- b->key.k.p.offset, msg);
- BUG();
-#endif
-}
-
-static enum btree_insert_ret
-bch_btree_insert_keys_interior(struct btree *b,
- struct btree_iter *iter,
- struct keylist *insert_keys,
- struct btree_interior_update *as,
- struct btree_reserve *res)
-{
- struct bch_fs *c = iter->c;
- struct btree_iter *linked;
- struct btree_node_iter node_iter;
- struct bkey_i *insert = bch_keylist_front(insert_keys);
- struct bkey_packed *k;
-
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
- BUG_ON(!b->level);
- BUG_ON(!as || as->b);
- verify_keys_sorted(insert_keys);
-
- btree_node_lock_for_insert(b, iter);
-
- if (bch_keylist_u64s(insert_keys) >
- bch_btree_keys_u64s_remaining(c, b)) {
- btree_node_unlock_write(b, iter);
- return BTREE_INSERT_BTREE_NODE_FULL;
- }
-
- /* Don't screw up @iter's position: */
- node_iter = iter->node_iters[b->level];
-
- /*
- * btree_split(), btree_gc_coalesce() will insert keys before
- * the iterator's current position - they know the keys go in
- * the node the iterator points to:
- */
- while ((k = bch_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_packed(b, k, &insert->k) >= 0))
- ;
-
- while (!bch_keylist_empty(insert_keys)) {
- insert = bch_keylist_front(insert_keys);
-
- bch_insert_fixup_btree_ptr(iter, b, insert,
- &node_iter, &res->disk_res);
- bch_keylist_pop_front(insert_keys);
- }
-
- btree_interior_update_updated_btree(c, as, b);
-
- for_each_linked_btree_node(iter, b, linked)
- bch_btree_node_iter_peek(&linked->node_iters[b->level],
- b);
- bch_btree_node_iter_peek(&iter->node_iters[b->level], b);
-
- bch_btree_iter_verify(iter, b);
-
- if (bch_maybe_compact_whiteouts(c, b))
- bch_btree_iter_reinit_node(iter, b);
-
- btree_node_unlock_write(b, iter);
-
- btree_node_interior_verify(b);
- return BTREE_INSERT_OK;
-}
-
-/*
- * Move keys from n1 (original replacement node, now lower node) to n2 (higher
- * node)
- */
-static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
- struct btree_reserve *reserve)
-{
- size_t nr_packed = 0, nr_unpacked = 0;
- struct btree *n2;
- struct bset *set1, *set2;
- struct bkey_packed *k, *prev = NULL;
-
- n2 = bch_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
- n2->data->max_key = n1->data->max_key;
- n2->data->format = n1->format;
- n2->key.k.p = n1->key.k.p;
-
- btree_node_set_format(n2, n2->data->format);
-
- set1 = btree_bset_first(n1);
- set2 = btree_bset_first(n2);
-
- /*
- * Has to be a linear search because we don't have an auxiliary
- * search tree yet
- */
- k = set1->start;
- while (1) {
- if (bkey_next(k) == vstruct_last(set1))
- break;
- if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
- break;
-
- if (bkey_packed(k))
- nr_packed++;
- else
- nr_unpacked++;
-
- prev = k;
- k = bkey_next(k);
- }
-
- BUG_ON(!prev);
-
- n1->key.k.p = bkey_unpack_pos(n1, prev);
- n1->data->max_key = n1->key.k.p;
- n2->data->min_key =
- btree_type_successor(n1->btree_id, n1->key.k.p);
-
- set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
- set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
-
- set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
-
- n2->nr.live_u64s = le16_to_cpu(set2->u64s);
- n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s);
- n2->nr.packed_keys = n1->nr.packed_keys - nr_packed;
- n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked;
-
- n1->nr.live_u64s = le16_to_cpu(set1->u64s);
- n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
- n1->nr.packed_keys = nr_packed;
- n1->nr.unpacked_keys = nr_unpacked;
-
- BUG_ON(!set1->u64s);
- BUG_ON(!set2->u64s);
-
- memcpy_u64s(set2->start,
- vstruct_end(set1),
- le16_to_cpu(set2->u64s));
-
- btree_node_reset_sib_u64s(n1);
- btree_node_reset_sib_u64s(n2);
-
- bch_verify_btree_nr_keys(n1);
- bch_verify_btree_nr_keys(n2);
-
- if (n1->level) {
- btree_node_interior_verify(n1);
- btree_node_interior_verify(n2);
- }
-
- return n2;
-}
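/*
 * Editorial sketch (not part of the original file): __btree_split_node() above
 * stops at roughly 3/5 of set1's u64s, so for example with set1->u64s == 100
 * the loop breaks once k->_data - set1->_data >= 60. The same arithmetic in
 * isolation, with plain integers:
 */
static inline unsigned example_split_point_u64s(unsigned set_u64s)
{
	/* same ratio as the break condition in __btree_split_node() */
	return (set_u64s * 3) / 5;
}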
-
-/*
- * For updates to interior nodes, we've got to do the insert before we split
- * because the stuff we're inserting has to be inserted atomically. Post split,
- * the keys might have to go in different nodes and the split would no longer be
- * atomic.
- *
 * Worse, if the insert comes from btree node coalescing and we do the insert
 * after the split (and after picking the pivot), the pivot we pick might fall
 * between nodes that were coalesced - and thus in the middle of a child node
 * post coalescing:
- */
-static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
- struct keylist *keys,
- struct btree_reserve *res)
-{
- struct btree_node_iter node_iter;
- struct bkey_i *k = bch_keylist_front(keys);
- struct bkey_packed *p;
- struct bset *i;
-
- BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
-
- bch_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
-
- while (!bch_keylist_empty(keys)) {
- k = bch_keylist_front(keys);
-
- BUG_ON(bch_keylist_u64s(keys) >
- bch_btree_keys_u64s_remaining(iter->c, b));
- BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
- BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
-
- bch_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res);
- bch_keylist_pop_front(keys);
- }
-
- /*
- * We can't tolerate whiteouts here - with whiteouts there can be
- * duplicate keys, and it would be rather bad if we picked a duplicate
- * for the pivot:
- */
- i = btree_bset_first(b);
- p = i->start;
- while (p != vstruct_last(i))
- if (bkey_deleted(p)) {
- le16_add_cpu(&i->u64s, -p->u64s);
- set_btree_bset_end(b, b->set);
- memmove_u64s_down(p, bkey_next(p),
- (u64 *) vstruct_last(i) -
- (u64 *) p);
- } else
- p = bkey_next(p);
-
- BUG_ON(b->nsets != 1 ||
- b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
-
- btree_node_interior_verify(b);
-}
-
-static void btree_split(struct btree *b, struct btree_iter *iter,
- struct keylist *insert_keys,
- struct btree_reserve *reserve,
- struct btree_interior_update *as)
-{
- struct bch_fs *c = iter->c;
- struct btree *parent = iter->nodes[b->level + 1];
- struct btree *n1, *n2 = NULL, *n3 = NULL;
- u64 start_time = local_clock();
-
- BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
-
- bch_btree_interior_update_will_free_node(c, as, b);
-
- n1 = btree_node_alloc_replacement(c, b, reserve);
- if (b->level)
- btree_split_insert_keys(iter, n1, insert_keys, reserve);
-
- if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
- trace_bcache_btree_node_split(c, b, b->nr.live_u64s);
-
- n2 = __btree_split_node(iter, n1, reserve);
-
- bch_btree_build_aux_trees(n2);
- bch_btree_build_aux_trees(n1);
- six_unlock_write(&n2->lock);
- six_unlock_write(&n1->lock);
-
- bch_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1);
-
- /*
-		 * Note that on recursive splits, parent_keys == insert_keys, so we
- * can't start adding new keys to parent_keys before emptying it
- * out (which we did with btree_split_insert_keys() above)
- */
- bch_keylist_add(&as->parent_keys, &n1->key);
- bch_keylist_add(&as->parent_keys, &n2->key);
-
- if (!parent) {
- /* Depth increases, make a new root */
- n3 = __btree_root_alloc(c, b->level + 1,
- iter->btree_id,
- reserve);
- n3->sib_u64s[0] = U16_MAX;
- n3->sib_u64s[1] = U16_MAX;
-
- btree_split_insert_keys(iter, n3, &as->parent_keys,
- reserve);
- bch_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1);
- }
- } else {
- trace_bcache_btree_node_compact(c, b, b->nr.live_u64s);
-
- bch_btree_build_aux_trees(n1);
- six_unlock_write(&n1->lock);
-
- bch_keylist_add(&as->parent_keys, &n1->key);
- }
-
- bch_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1);
-
- /* New nodes all written, now make them visible: */
-
- if (parent) {
- /* Split a non root node */
- bch_btree_insert_node(parent, iter, &as->parent_keys,
- reserve, as);
- } else if (n3) {
- bch_btree_set_root(iter, n3, as, reserve);
- } else {
- /* Root filled up but didn't need to be split */
- bch_btree_set_root(iter, n1, as, reserve);
- }
-
- btree_open_bucket_put(c, n1);
- if (n2)
- btree_open_bucket_put(c, n2);
- if (n3)
- btree_open_bucket_put(c, n3);
-
- /*
- * Note - at this point other linked iterators could still have @b read
- * locked; we're depending on the bch_btree_iter_node_replace() calls
- * below removing all references to @b so we don't return with other
- * iterators pointing to a node they have locked that's been freed.
- *
-	 * We have to free the node first because the
-	 * bch_btree_iter_node_replace() calls will drop _our_ iterator's
-	 * reference - and intent lock - to @b.
- */
- bch_btree_node_free_inmem(iter, b);
-
- /* Successful split, update the iterator to point to the new nodes: */
-
- if (n3)
- bch_btree_iter_node_replace(iter, n3);
- if (n2)
- bch_btree_iter_node_replace(iter, n2);
- bch_btree_iter_node_replace(iter, n1);
-
- bch_time_stats_update(&c->btree_split_time, start_time);
-}
-
-/**
- * bch_btree_insert_node - insert bkeys into a given btree node
- *
 * @b:			interior btree node to insert into
 * @iter:		btree iterator
 * @insert_keys:	list of keys to insert
 * @reserve:		btree reserve for allocating new nodes
 * @as:			btree_interior_update tracking this split/rewrite
 *
 * Inserts as many keys as it can into a given btree node, splitting it if
 * full. Inserts into interior nodes have to be atomic: if the node is full,
 * the keys are inserted into the replacement node(s) before the split point
 * is picked (see btree_split_insert_keys()).
- */
-void bch_btree_insert_node(struct btree *b,
- struct btree_iter *iter,
- struct keylist *insert_keys,
- struct btree_reserve *reserve,
- struct btree_interior_update *as)
-{
- BUG_ON(!b->level);
- BUG_ON(!reserve || !as);
-
- switch (bch_btree_insert_keys_interior(b, iter, insert_keys,
- as, reserve)) {
- case BTREE_INSERT_OK:
- break;
- case BTREE_INSERT_BTREE_NODE_FULL:
- btree_split(b, iter, insert_keys, reserve, as);
- break;
- default:
- BUG();
- }
-}
-
-static int bch_btree_split_leaf(struct btree_iter *iter, unsigned flags)
-{
- struct bch_fs *c = iter->c;
- struct btree *b = iter->nodes[0];
- struct btree_reserve *reserve;
- struct btree_interior_update *as;
- struct closure cl;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- /* Hack, because gc and splitting nodes doesn't mix yet: */
- if (!down_read_trylock(&c->gc_lock)) {
- bch_btree_iter_unlock(iter);
- down_read(&c->gc_lock);
- }
-
- /*
- * XXX: figure out how far we might need to split,
- * instead of locking/reserving all the way to the root:
- */
- if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
- ret = -EINTR;
- goto out;
- }
-
- reserve = bch_btree_reserve_get(c, b, 0, flags, &cl);
- if (IS_ERR(reserve)) {
- ret = PTR_ERR(reserve);
- if (ret == -EAGAIN) {
- bch_btree_iter_unlock(iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- return -EINTR;
- }
- goto out;
- }
-
- as = bch_btree_interior_update_alloc(c);
-
- btree_split(b, iter, NULL, reserve, as);
- bch_btree_reserve_put(c, reserve);
-
- bch_btree_iter_set_locks_want(iter, 1);
-out:
- up_read(&c->gc_lock);
- return ret;
-}
-
-enum btree_node_sibling {
- btree_prev_sib,
- btree_next_sib,
-};
-
-static struct btree *btree_node_get_sibling(struct btree_iter *iter,
- struct btree *b,
- enum btree_node_sibling sib)
-{
- struct btree *parent;
- struct btree_node_iter node_iter;
- struct bkey_packed *k;
- BKEY_PADDED(k) tmp;
- struct btree *ret;
- unsigned level = b->level;
-
- parent = iter->nodes[level + 1];
- if (!parent)
- return NULL;
-
- if (!btree_node_relock(iter, level + 1)) {
- bch_btree_iter_set_locks_want(iter, level + 2);
- return ERR_PTR(-EINTR);
- }
-
- node_iter = iter->node_iters[parent->level];
-
- k = bch_btree_node_iter_peek_all(&node_iter, parent);
- BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
-
- do {
- k = sib == btree_prev_sib
- ? bch_btree_node_iter_prev_all(&node_iter, parent)
- : (bch_btree_node_iter_advance(&node_iter, parent),
- bch_btree_node_iter_peek_all(&node_iter, parent));
- if (!k)
- return NULL;
- } while (bkey_deleted(k));
-
- bkey_unpack(parent, &tmp.k, k);
-
- ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
-
- if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
- btree_node_unlock(iter, level);
- ret = bch_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
- }
-
- if (!IS_ERR(ret) && !btree_node_relock(iter, level)) {
- six_unlock_intent(&ret->lock);
- ret = ERR_PTR(-EINTR);
- }
-
- return ret;
-}
-
-static int __foreground_maybe_merge(struct btree_iter *iter,
- enum btree_node_sibling sib)
-{
- struct bch_fs *c = iter->c;
- struct btree_reserve *reserve;
- struct btree_interior_update *as;
- struct bkey_format_state new_s;
- struct bkey_format new_f;
- struct bkey_i delete;
- struct btree *b, *m, *n, *prev, *next, *parent;
- struct closure cl;
- size_t sib_u64s;
- int ret = 0;
-
- closure_init_stack(&cl);
-retry:
- if (!btree_node_relock(iter, iter->level))
- return 0;
-
- b = iter->nodes[iter->level];
-
- parent = iter->nodes[b->level + 1];
- if (!parent)
- return 0;
-
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
- return 0;
-
- /* XXX: can't be holding read locks */
- m = btree_node_get_sibling(iter, b, sib);
- if (IS_ERR(m)) {
- ret = PTR_ERR(m);
- goto out;
- }
-
- /* NULL means no sibling: */
- if (!m) {
- b->sib_u64s[sib] = U16_MAX;
- return 0;
- }
-
- if (sib == btree_prev_sib) {
- prev = m;
- next = b;
- } else {
- prev = b;
- next = m;
- }
-
- bch_bkey_format_init(&new_s);
- __bch_btree_calc_format(&new_s, b);
- __bch_btree_calc_format(&new_s, m);
- new_f = bch_bkey_format_done(&new_s);
-
- sib_u64s = btree_node_u64s_with_format(b, &new_f) +
- btree_node_u64s_with_format(m, &new_f);
-
- if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
- sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- sib_u64s /= 2;
- sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
- }
-
- sib_u64s = min(sib_u64s, btree_max_u64s(c));
- b->sib_u64s[sib] = sib_u64s;
-
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
- six_unlock_intent(&m->lock);
- return 0;
- }
-
- /* We're changing btree topology, doesn't mix with gc: */
- if (!down_read_trylock(&c->gc_lock)) {
- six_unlock_intent(&m->lock);
- bch_btree_iter_unlock(iter);
-
- down_read(&c->gc_lock);
- up_read(&c->gc_lock);
- ret = -EINTR;
- goto out;
- }
-
- if (!bch_btree_iter_set_locks_want(iter, U8_MAX)) {
- ret = -EINTR;
- goto out_unlock;
- }
-
- reserve = bch_btree_reserve_get(c, b, 0,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- &cl);
- if (IS_ERR(reserve)) {
- ret = PTR_ERR(reserve);
- goto out_unlock;
- }
-
- as = bch_btree_interior_update_alloc(c);
-
- bch_btree_interior_update_will_free_node(c, as, b);
- bch_btree_interior_update_will_free_node(c, as, m);
-
- n = bch_btree_node_alloc(c, b->level, b->btree_id, reserve);
- n->data->min_key = prev->data->min_key;
- n->data->max_key = next->data->max_key;
- n->data->format = new_f;
- n->key.k.p = next->key.k.p;
-
- btree_node_set_format(n, new_f);
-
- bch_btree_sort_into(c, n, prev);
- bch_btree_sort_into(c, n, next);
-
- bch_btree_build_aux_trees(n);
- six_unlock_write(&n->lock);
-
- bkey_init(&delete.k);
- delete.k.p = prev->key.k.p;
- bch_keylist_add(&as->parent_keys, &delete);
- bch_keylist_add(&as->parent_keys, &n->key);
-
- bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
-
- bch_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
-
- btree_open_bucket_put(c, n);
- bch_btree_node_free_inmem(iter, b);
- bch_btree_node_free_inmem(iter, m);
- bch_btree_iter_node_replace(iter, n);
-
- bch_btree_iter_verify(iter, n);
-
- bch_btree_reserve_put(c, reserve);
-out_unlock:
- if (ret != -EINTR && ret != -EAGAIN)
- bch_btree_iter_set_locks_want(iter, 1);
- six_unlock_intent(&m->lock);
- up_read(&c->gc_lock);
-out:
- if (ret == -EAGAIN || ret == -EINTR) {
- bch_btree_iter_unlock(iter);
- ret = -EINTR;
- }
-
- closure_sync(&cl);
-
- if (ret == -EINTR) {
- ret = bch_btree_iter_traverse(iter);
- if (!ret)
- goto retry;
- }
-
- return ret;
-}
-
-static inline int foreground_maybe_merge(struct btree_iter *iter,
- enum btree_node_sibling sib)
-{
- struct bch_fs *c = iter->c;
- struct btree *b;
-
- if (!btree_node_locked(iter, iter->level))
- return 0;
-
- b = iter->nodes[iter->level];
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
- return 0;
-
- return __foreground_maybe_merge(iter, sib);
-}
-
-/**
 * btree_insert_key - insert one key into a leaf node
- */
-static enum btree_insert_ret
-btree_insert_key(struct btree_insert *trans,
- struct btree_insert_entry *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
- struct btree *b = iter->nodes[0];
- enum btree_insert_ret ret;
- int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
-
- ret = !btree_node_is_extents(b)
- ? bch_insert_fixup_key(trans, insert)
- : bch_insert_fixup_extent(trans, insert);
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch_maybe_compact_whiteouts(iter->c, b))
- bch_btree_iter_reinit_node(iter, b);
-
- trace_bcache_btree_insert_key(c, b, insert->k);
- return ret;
-}
-
-static bool same_leaf_as_prev(struct btree_insert *trans,
- struct btree_insert_entry *i)
-{
- /*
- * Because we sorted the transaction entries, if multiple iterators
- * point to the same leaf node they'll always be adjacent now:
- */
- return i != trans->entries &&
- i[0].iter->nodes[0] == i[-1].iter->nodes[0];
-}
-
-#define trans_for_each_entry(trans, i) \
- for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
-
-static void multi_lock_write(struct btree_insert *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_entry(trans, i)
- if (!same_leaf_as_prev(trans, i))
- btree_node_lock_for_insert(i->iter->nodes[0], i->iter);
-}
-
-static void multi_unlock_write(struct btree_insert *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_entry(trans, i)
- if (!same_leaf_as_prev(trans, i))
- btree_node_unlock_write(i->iter->nodes[0], i->iter);
-}
-
-static int btree_trans_entry_cmp(const void *_l, const void *_r)
-{
- const struct btree_insert_entry *l = _l;
- const struct btree_insert_entry *r = _r;
-
- return btree_iter_cmp(l->iter, r->iter);
-}
-
-/* Normal update interface: */
-
-/**
- * __bch_btree_insert_at - insert keys at given iterator positions
- *
 * This is the main entry point for btree updates.
- *
- * Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-int __bch_btree_insert_at(struct btree_insert *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- struct btree_iter *split = NULL;
- bool cycle_gc_lock = false;
- unsigned u64s;
- int ret;
-
- trans_for_each_entry(trans, i) {
- EBUG_ON(i->iter->level);
- EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
- }
-
- sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
- btree_trans_entry_cmp, NULL);
-
- if (unlikely(!percpu_ref_tryget(&c->writes)))
- return -EROFS;
-retry_locks:
- ret = -EINTR;
- trans_for_each_entry(trans, i)
- if (!bch_btree_iter_set_locks_want(i->iter, 1))
- goto err;
-retry:
- trans->did_work = false;
- u64s = 0;
- trans_for_each_entry(trans, i)
- if (!i->done)
- u64s += jset_u64s(i->k->k.u64s + i->extra_res);
-
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-
- ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
- ? bch_journal_res_get(&c->journal,
- &trans->journal_res,
- u64s, u64s)
- : 0;
- if (ret)
- goto err;
-
- multi_lock_write(trans);
-
- u64s = 0;
- trans_for_each_entry(trans, i) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, i))
- u64s = 0;
-
- /*
- * bch_btree_node_insert_fits() must be called under write lock:
- * with only an intent lock, another thread can still call
- * bch_btree_node_write(), converting an unwritten bset to a
- * written one
- */
- if (!i->done) {
- u64s += i->k->k.u64s + i->extra_res;
- if (!bch_btree_node_insert_fits(c,
- i->iter->nodes[0], u64s)) {
- split = i->iter;
- goto unlock;
- }
- }
- }
-
- ret = 0;
- split = NULL;
- cycle_gc_lock = false;
-
- trans_for_each_entry(trans, i) {
- if (i->done)
- continue;
-
- switch (btree_insert_key(trans, i)) {
- case BTREE_INSERT_OK:
- i->done = true;
- break;
- case BTREE_INSERT_JOURNAL_RES_FULL:
- case BTREE_INSERT_NEED_TRAVERSE:
- ret = -EINTR;
- break;
- case BTREE_INSERT_NEED_RESCHED:
- ret = -EAGAIN;
- break;
- case BTREE_INSERT_BTREE_NODE_FULL:
- split = i->iter;
- break;
- case BTREE_INSERT_ENOSPC:
- ret = -ENOSPC;
- break;
- case BTREE_INSERT_NEED_GC_LOCK:
- cycle_gc_lock = true;
- ret = -EINTR;
- break;
- default:
- BUG();
- }
-
- if (!trans->did_work && (ret || split))
- break;
- }
-unlock:
- multi_unlock_write(trans);
- bch_journal_res_put(&c->journal, &trans->journal_res);
-
- if (split)
- goto split;
- if (ret)
- goto err;
-
- /*
- * hack: iterators are inconsistent when they hit end of leaf, until
- * traversed again
- */
- trans_for_each_entry(trans, i)
- if (i->iter->at_end_of_leaf)
- goto out;
-
- trans_for_each_entry(trans, i)
- if (!same_leaf_as_prev(trans, i)) {
- foreground_maybe_merge(i->iter, btree_prev_sib);
- foreground_maybe_merge(i->iter, btree_next_sib);
- }
-out:
- /* make sure we didn't lose an error: */
- if (!ret && IS_ENABLED(CONFIG_BCACHE_DEBUG))
- trans_for_each_entry(trans, i)
- BUG_ON(!i->done);
-
- percpu_ref_put(&c->writes);
- return ret;
-split:
- /*
-	 * We have to drop the journal res before splitting, because splitting
-	 * means allocating new btree nodes, and holding a journal reservation
- * potentially blocks the allocator:
- */
- ret = bch_btree_split_leaf(split, trans->flags);
- if (ret)
- goto err;
- /*
- * if the split didn't have to drop locks the insert will still be
- * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
- * and is overwriting won't have changed)
- */
- goto retry_locks;
-err:
- if (cycle_gc_lock) {
- down_read(&c->gc_lock);
- up_read(&c->gc_lock);
- }
-
- if (ret == -EINTR) {
- trans_for_each_entry(trans, i) {
- int ret2 = bch_btree_iter_traverse(i->iter);
- if (ret2) {
- ret = ret2;
- goto out;
- }
- }
-
- /*
-		 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
- * dropped locks:
- */
- if (!(trans->flags & BTREE_INSERT_ATOMIC))
- goto retry;
- }
-
- goto out;
-}
-
-int bch_btree_insert_list_at(struct btree_iter *iter,
- struct keylist *keys,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq, unsigned flags)
-{
- BUG_ON(flags & BTREE_INSERT_ATOMIC);
- BUG_ON(bch_keylist_empty(keys));
- verify_keys_sorted(keys);
-
- while (!bch_keylist_empty(keys)) {
- /* need to traverse between each insert */
- int ret = bch_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- ret = bch_btree_insert_at(iter->c, disk_res, hook,
- journal_seq, flags,
- BTREE_INSERT_ENTRY(iter, bch_keylist_front(keys)));
- if (ret)
- return ret;
-
- bch_keylist_pop_front(keys);
- }
-
- return 0;
-}
-
-/**
- * bch_btree_insert_check_key - insert dummy key into btree
- *
- * We insert a random key on a cache miss, then compare exchange on it
- * once the cache promotion or backing device read completes. This
- * ensures that if this key is written to after the read, the read will
- * lose and not overwrite the key with stale data.
- *
- * Return values:
- * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
- * -EINTR: btree node was changed while upgrading to write lock
- */
-int bch_btree_insert_check_key(struct btree_iter *iter,
- struct bkey_i *check_key)
-{
- struct bpos saved_pos = iter->pos;
- struct bkey_i_cookie *cookie;
- BKEY_PADDED(key) tmp;
- int ret;
-
- BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
-
- check_key->k.type = KEY_TYPE_COOKIE;
- set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
-
- cookie = bkey_i_to_cookie(check_key);
- get_random_bytes(&cookie->v, sizeof(cookie->v));
-
- bkey_copy(&tmp.key, check_key);
-
- ret = bch_btree_insert_at(iter->c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &tmp.key));
-
- bch_btree_iter_rewind(iter, saved_pos);
-
- return ret;
-}
-
-/**
 * bch_btree_insert - insert a key into the btree specified by @id
 * @c:			pointer to struct bch_fs
 * @id:			btree to insert into
 * @k:			key to insert
 * @disk_res:		disk reservation
 * @hook:		extent insert callback
 * @journal_seq:	if non NULL, filled in with the journal sequence number
 * @flags:		BTREE_INSERT_* flags
- */
-int bch_btree_insert(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq, int flags)
-{
- struct btree_iter iter;
- int ret, ret2;
-
- bch_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k));
-
- ret = bch_btree_iter_traverse(&iter);
- if (unlikely(ret))
- goto out;
-
- ret = bch_btree_insert_at(c, disk_res, hook, journal_seq, flags,
- BTREE_INSERT_ENTRY(&iter, k));
-out: ret2 = bch_btree_iter_unlock(&iter);
-
- return ret ?: ret2;
-}
-
-/**
 * bch_btree_update - like bch_btree_insert(), but requires that we're
 * overwriting an existing key (returns -ENOENT if @k isn't present)
- */
-int bch_btree_update(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k, u64 *journal_seq)
-{
- struct btree_iter iter;
- struct bkey_s_c u;
- int ret;
-
- EBUG_ON(id == BTREE_ID_EXTENTS);
-
- bch_btree_iter_init_intent(&iter, c, id, k->k.p);
-
- u = bch_btree_iter_peek_with_holes(&iter);
- ret = btree_iter_err(u);
- if (ret)
- return ret;
-
- if (bkey_deleted(u.k)) {
- bch_btree_iter_unlock(&iter);
- return -ENOENT;
- }
-
- ret = bch_btree_insert_at(c, NULL, NULL, journal_seq, 0,
- BTREE_INSERT_ENTRY(&iter, k));
- bch_btree_iter_unlock(&iter);
- return ret;
-}
-
-/*
- * bch_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch_btree_delete_range(struct bch_fs *c, enum btree_id id,
- struct bpos start,
- struct bpos end,
- struct bversion version,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch_btree_iter_init_intent(&iter, c, id, start);
-
- while ((k = bch_btree_iter_peek(&iter)).k &&
- !(ret = btree_iter_err(k))) {
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- /* really shouldn't be using a bare, unpadded bkey_i */
- struct bkey_i delete;
-
- if (bkey_cmp(iter.pos, end) >= 0)
- break;
-
- bkey_init(&delete.k);
-
- /*
- * For extents, iter.pos won't necessarily be the same as
- * bkey_start_pos(k.k) (for non extents they always will be the
- * same). It's important that we delete starting from iter.pos
- * because the range we want to delete could start in the middle
- * of k.
- *
- * (bch_btree_iter_peek() does guarantee that iter.pos >=
- * bkey_start_pos(k.k)).
- */
- delete.k.p = iter.pos;
- delete.k.version = version;
-
- if (iter.is_extents) {
- /*
- * The extents btree is special - KEY_TYPE_DISCARD is
- * used for deletions, not KEY_TYPE_DELETED. This is an
- * internal implementation detail that probably
- * shouldn't be exposed (internally, KEY_TYPE_DELETED is
- * used as a proxy for k->size == 0):
- */
- delete.k.type = KEY_TYPE_DISCARD;
-
- /* create the biggest key we can */
- bch_key_resize(&delete.k, max_sectors);
- bch_cut_back(end, &delete.k);
- }
-
- ret = bch_btree_insert_at(c, disk_res, hook, journal_seq,
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &delete));
- if (ret)
- break;
-
- bch_btree_iter_cond_resched(&iter);
- }
-
- bch_btree_iter_unlock(&iter);
- return ret;
-}
-
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- *
- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
 * bch_btree_reserve_get() has to wait)
- */
-int bch_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
- struct closure *cl)
-{
- struct bch_fs *c = iter->c;
- struct btree *n, *parent = iter->nodes[b->level + 1];
- struct btree_reserve *reserve;
- struct btree_interior_update *as;
- unsigned flags = BTREE_INSERT_NOFAIL;
-
- /*
-	 * if the caller is going to wait when allocating the reserve fails,
-	 * then this is a rewrite that must succeed:
- */
- if (cl)
- flags |= BTREE_INSERT_USE_RESERVE;
-
- if (!bch_btree_iter_set_locks_want(iter, U8_MAX))
- return -EINTR;
-
- reserve = bch_btree_reserve_get(c, b, 0, flags, cl);
- if (IS_ERR(reserve)) {
- trace_bcache_btree_gc_rewrite_node_fail(c, b);
- return PTR_ERR(reserve);
- }
-
- as = bch_btree_interior_update_alloc(c);
-
- bch_btree_interior_update_will_free_node(c, as, b);
-
- n = btree_node_alloc_replacement(c, b, reserve);
-
- bch_btree_build_aux_trees(n);
- six_unlock_write(&n->lock);
-
- trace_bcache_btree_gc_rewrite_node(c, b);
-
- bch_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
-
- if (parent) {
- bch_btree_insert_node(parent, iter,
- &keylist_single(&n->key),
- reserve, as);
- } else {
- bch_btree_set_root(iter, n, as, reserve);
- }
-
- btree_open_bucket_put(c, n);
-
- bch_btree_node_free_inmem(iter, b);
-
- BUG_ON(!bch_btree_iter_node_replace(iter, n));
-
- bch_btree_reserve_put(c, reserve);
- return 0;
-}
diff --git a/libbcache/btree_update.h b/libbcache/btree_update.h
deleted file mode 100644
index 0be71862..00000000
--- a/libbcache/btree_update.h
+++ /dev/null
@@ -1,424 +0,0 @@
-#ifndef _BCACHE_BTREE_INSERT_H
-#define _BCACHE_BTREE_INSERT_H
-
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "journal.h"
-#include "vstructs.h"
-
-struct bch_fs;
-struct bkey_format_state;
-struct bkey_format;
-struct btree;
-
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
- b->sib_u64s[0] = b->nr.live_u64s;
- b->sib_u64s[1] = b->nr.live_u64s;
-}
-
-struct btree_reserve {
- struct disk_reservation disk_res;
- unsigned nr;
- struct btree *b[BTREE_RESERVE_MAX];
-};
-
-void __bch_btree_calc_format(struct bkey_format_state *, struct btree *);
-bool bch_btree_node_format_fits(struct bch_fs *c, struct btree *,
- struct bkey_format *);
-
-/* Btree node freeing/allocation: */
-
-/*
- * Tracks a btree node that has been (or is about to be) freed in memory, but
- * has _not_ yet been freed on disk (because the write that makes the new
- * node(s) visible and frees the old hasn't completed yet)
- */
-struct pending_btree_node_free {
- bool index_update_done;
-
- __le64 seq;
- enum btree_id btree_id;
- unsigned level;
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
-
-/*
- * Tracks an in progress split/rewrite of a btree node and the update to the
- * parent node:
- *
- * When we split/rewrite a node, we do all the updates in memory without
- * waiting for any writes to complete - we allocate the new node(s) and update
- * the parent node, possibly recursively up to the root.
- *
- * The end result is that we have one or more new nodes being written -
- * possibly several, if there were multiple splits - and then a write (updating
- * an interior node) which will make all these new nodes visible.
- *
- * Additionally, as we split/rewrite nodes we free the old nodes - but the old
- * nodes can't be freed (their space on disk can't be reclaimed) until the
- * update to the interior node that makes the new node visible completes -
- * until then, the old nodes are still reachable on disk.
- *
- */
-struct btree_interior_update {
- struct closure cl;
- struct bch_fs *c;
-
- struct list_head list;
-
- /* What kind of update are we doing? */
- enum {
- BTREE_INTERIOR_NO_UPDATE,
- BTREE_INTERIOR_UPDATING_NODE,
- BTREE_INTERIOR_UPDATING_ROOT,
- BTREE_INTERIOR_UPDATING_AS,
- } mode;
-
- /*
- * BTREE_INTERIOR_UPDATING_NODE:
- * The update that made the new nodes visible was a regular update to an
- * existing interior node - @b. We can't write out the update to @b
- * until the new nodes we created are finished writing, so we block @b
-	 * from writing by putting this btree_interior_update on the
- * @b->write_blocked list with @write_blocked_list:
- */
- struct btree *b;
- struct list_head write_blocked_list;
-
- /*
-	 * BTREE_INTERIOR_UPDATING_AS: the btree node we updated was freed, so
-	 * now we're blocking another btree_interior_update
- * @parent_as - btree_interior_update that's waiting on our nodes to finish
- * writing, before it can make new nodes visible on disk
- * @wait - list of child btree_interior_updates that are waiting on this
- * btree_interior_update to make all the new nodes visible before they can free
- * their old btree nodes
- */
- struct btree_interior_update *parent_as;
- struct closure_waitlist wait;
-
- /*
- * We may be freeing nodes that were dirty, and thus had journal entries
- * pinned: we need to transfer the oldest of those pins to the
- * btree_interior_update operation, and release it when the new node(s)
- * are all persistent and reachable:
- */
- struct journal_entry_pin journal;
-
- u64 journal_seq;
-
- /*
- * Nodes being freed:
- * Protected by c->btree_node_pending_free_lock
- */
- struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
- unsigned nr_pending;
-
- /* Only here to reduce stack usage on recursive splits: */
- struct keylist parent_keys;
- /*
- * Enough room for btree_split's keys without realloc - btree node
-	 * pointers never have crc/compression info, so we only need to account
- * for the pointers for three keys
- */
- u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
-};
-
-#define for_each_pending_btree_node_free(c, as, p) \
- list_for_each_entry(as, &c->btree_interior_update_list, list) \
- for (p = as->pending; p < as->pending + as->nr_pending; p++)
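/*
 * Hypothetical usage of the iterator macro above (editorial sketch, not from
 * the original header); assumes the caller holds c->btree_interior_update_lock
 * so the list and the pending arrays are stable:
 */
static inline unsigned example_count_pending_node_frees(struct bch_fs *c)
{
	struct btree_interior_update *as;
	struct pending_btree_node_free *p;
	unsigned nr = 0;

	for_each_pending_btree_node_free(c, as, p)
		nr++;

	return nr;
}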
-
-void bch_btree_node_free_inmem(struct btree_iter *, struct btree *);
-void bch_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
-
-void btree_open_bucket_put(struct bch_fs *c, struct btree *);
-
-struct btree *__btree_node_alloc_replacement(struct bch_fs *,
- struct btree *,
- struct bkey_format,
- struct btree_reserve *);
-struct btree *btree_node_alloc_replacement(struct bch_fs *, struct btree *,
- struct btree_reserve *);
-
-struct btree_interior_update *
-bch_btree_interior_update_alloc(struct bch_fs *);
-
-void bch_btree_interior_update_will_free_node(struct bch_fs *,
- struct btree_interior_update *,
- struct btree *);
-
-void bch_btree_set_root_initial(struct bch_fs *, struct btree *,
- struct btree_reserve *);
-
-void bch_btree_reserve_put(struct bch_fs *, struct btree_reserve *);
-struct btree_reserve *bch_btree_reserve_get(struct bch_fs *,
- struct btree *, unsigned,
- unsigned, struct closure *);
-
-int bch_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *);
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-bool bch_btree_bset_insert_key(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bkey_i *);
-void bch_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
- struct bkey_i *);
-
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
-{
- return (void *) b->data + btree_bytes(c);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
- struct btree *b)
-{
- return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
-}
-
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
- struct btree *b)
-{
- return btree_data_end(c, b);
-}
-
-static inline void *write_block(struct btree *b)
-{
- return (void *) b->data + (b->written << 9);
-}
-
-static inline bool bset_written(struct btree *b, struct bset *i)
-{
- return (void *) i < write_block(b);
-}
-
-static inline bool bset_unwritten(struct btree *b, struct bset *i)
-{
- return (void *) i > write_block(b);
-}
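/*
 * Editorial note (not in the original header): b->written is in 512-byte
 * sectors, so write_block() is the start of the space that hasn't been written
 * out yet; bset_written() and bset_unwritten() are just pointer comparisons
 * against that boundary.
 */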
-
-static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
- struct bset *i)
-{
- return round_up(bset_byte_offset(b, vstruct_end(i)),
- block_bytes(c)) >> 9;
-}
-
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
-{
- struct bset *i = btree_bset_last(b);
- unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
- b->whiteout_u64s +
- b->uncompacted_whiteout_u64s;
- unsigned total = c->sb.btree_node_size << 6;
-
- EBUG_ON(used > total);
-
- if (bset_written(b, i))
- return 0;
-
- return total - used;
-}
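/*
 * Editorial sketch (not part of the original header): the "<< 6" above
 * converts c->sb.btree_node_size from 512-byte sectors into u64s, since each
 * sector holds 512 / sizeof(u64) == 64 u64s. The same conversion spelled out:
 */
static inline unsigned example_node_size_u64s(unsigned btree_node_sectors)
{
	return btree_node_sectors * (512 / sizeof(u64));	/* == sectors << 6 */
}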
-
-static inline unsigned btree_write_set_buffer(struct btree *b)
-{
- /*
- * Could buffer up larger amounts of keys for btrees with larger keys,
- * pending benchmarking:
- */
- return 4 << 10;
-}
-
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
- struct btree *b)
-{
- struct bset *i = btree_bset_last(b);
- unsigned offset = max_t(unsigned, b->written << 9,
- bset_byte_offset(b, vstruct_end(i)));
- ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
- (offset + sizeof(struct btree_node_entry) +
- b->whiteout_u64s * sizeof(u64) +
- b->uncompacted_whiteout_u64s * sizeof(u64));
-
- EBUG_ON(offset > btree_bytes(c));
-
- if ((unlikely(bset_written(b, i)) && n > 0) ||
- (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
- n > btree_write_set_buffer(b)))
- return (void *) b->data + offset;
-
- return NULL;
-}
-
-/*
- * write lock must be held on @b (else the dirty bset that we were going to
- * insert into could be written out from under us)
- */
-static inline bool bch_btree_node_insert_fits(struct bch_fs *c,
- struct btree *b, unsigned u64s)
-{
- if (btree_node_is_extents(b)) {
-		/*
-		 * The insert key might split an existing key
-		 * (bch_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
-		 */
- u64s += BKEY_EXTENT_U64s_MAX;
- }
-
- return u64s <= bch_btree_keys_u64s_remaining(c, b);
-}
-
-static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- if (bset_written(b, bset(b, t))) {
- EBUG_ON(b->uncompacted_whiteout_u64s <
- bkeyp_key_u64s(&b->format, k));
- b->uncompacted_whiteout_u64s -=
- bkeyp_key_u64s(&b->format, k);
- }
-}
-
-static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
- struct bkey_packed *k)
-{
- if (bset_written(b, bset(b, t))) {
- BUG_ON(!k->needs_whiteout);
- b->uncompacted_whiteout_u64s +=
- bkeyp_key_u64s(&b->format, k);
- }
-}
-
-void bch_btree_insert_node(struct btree *, struct btree_iter *,
- struct keylist *, struct btree_reserve *,
- struct btree_interior_update *as);
-
-/* Normal update interface: */
-
-struct btree_insert {
- struct bch_fs *c;
- struct disk_reservation *disk_res;
- struct journal_res journal_res;
- u64 *journal_seq;
- struct extent_insert_hook *hook;
- unsigned flags;
- bool did_work;
-
- unsigned short nr;
- struct btree_insert_entry {
- struct btree_iter *iter;
- struct bkey_i *k;
- unsigned extra_res;
- /*
- * true if entire key was inserted - can only be false for
- * extents
- */
- bool done;
- } *entries;
-};
-
-int __bch_btree_insert_at(struct btree_insert *);
-
-
-#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N
-#define COUNT_ARGS(...) _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
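/*
 * Editorial note (not in the original header): COUNT_ARGS() works by shifting
 * the argument list so that the literal count lands in _TENTH_ARG()'s N slot.
 * For example, COUNT_ARGS(a, b, c) expands to
 * _TENTH_ARG(a, b, c, 9, 8, 7, 6, 5, 4, 3, 2, 1), which evaluates to 3.
 */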
-
-#define BTREE_INSERT_ENTRY(_iter, _k) \
- ((struct btree_insert_entry) { \
- .iter = (_iter), \
- .k = (_k), \
- .done = false, \
- })
-
-#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \
- ((struct btree_insert_entry) { \
- .iter = (_iter), \
- .k = (_k), \
- .extra_res = (_extra), \
- .done = false, \
- })
-
-/**
- * bch_btree_insert_at - insert one or more keys at iterator positions
 * @_c:			filesystem handle
 * @_disk_res:		disk reservation
 * @_hook:		extent insert callback
 * @_journal_seq:	if non NULL, filled in with the journal sequence number
 * @_flags:		BTREE_INSERT_* flags
 * @...:		one or more BTREE_INSERT_ENTRY()s
- *
- * Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-#define bch_btree_insert_at(_c, _disk_res, _hook, \
- _journal_seq, _flags, ...) \
- __bch_btree_insert_at(&(struct btree_insert) { \
- .c = (_c), \
- .disk_res = (_disk_res), \
- .journal_seq = (_journal_seq), \
- .hook = (_hook), \
- .flags = (_flags), \
- .nr = COUNT_ARGS(__VA_ARGS__), \
- .entries = (struct btree_insert_entry[]) { \
- __VA_ARGS__ \
- }})
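/*
 * Hypothetical call site (editorial sketch, not part of the original header):
 * a minimal single-key insert through the macro above. Assumes @iter was set
 * up with bch_btree_iter_init_intent() and already points at the insert
 * position; the function name is illustrative only.
 */
static inline int example_insert_one_key(struct bch_fs *c,
					 struct btree_iter *iter,
					 struct bkey_i *k,
					 u64 *journal_seq)
{
	return bch_btree_insert_at(c, NULL, NULL, journal_seq, 0,
				   BTREE_INSERT_ENTRY(iter, k));
}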
-
-/*
 * Don't drop/retake locks: instead return -EINTR if we need to upgrade to
 * intent locks, or -EAGAIN if we need to wait on the btree reserve
- */
-#define BTREE_INSERT_ATOMIC (1 << 0)
-
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL (1 << 1)
-
-/* for copygc, or when merging btree nodes */
-#define BTREE_INSERT_USE_RESERVE (1 << 2)
-
-/*
- * Insert is for journal replay: don't get journal reservations, or mark extents
- * (bch_mark_key)
- */
-#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
-
-int bch_btree_insert_list_at(struct btree_iter *, struct keylist *,
- struct disk_reservation *,
- struct extent_insert_hook *, u64 *, unsigned);
-
-static inline bool journal_res_insert_fits(struct btree_insert *trans,
- struct btree_insert_entry *insert)
-{
- unsigned u64s = 0;
- struct btree_insert_entry *i;
-
- /*
- * If we didn't get a journal reservation, we're in journal replay and
- * we're not journalling updates:
- */
- if (!trans->journal_res.ref)
- return true;
-
- for (i = insert; i < trans->entries + trans->nr; i++)
- u64s += jset_u64s(i->k->k.u64s + i->extra_res);
-
- return u64s <= trans->journal_res.u64s;
-}
-
-int bch_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
-int bch_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
- struct disk_reservation *,
- struct extent_insert_hook *, u64 *, int flags);
-int bch_btree_update(struct bch_fs *, enum btree_id,
- struct bkey_i *, u64 *);
-
-int bch_btree_delete_range(struct bch_fs *, enum btree_id,
- struct bpos, struct bpos, struct bversion,
- struct disk_reservation *,
- struct extent_insert_hook *, u64 *);
-
-int bch_btree_node_rewrite(struct btree_iter *, struct btree *, struct closure *);
-
-#endif /* _BCACHE_BTREE_INSERT_H */
-
diff --git a/libbcache/buckets.c b/libbcache/buckets.c
deleted file mode 100644
index 7be943d1..00000000
--- a/libbcache/buckets.c
+++ /dev/null
@@ -1,750 +0,0 @@
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- *
- * Bucket states:
- * - free bucket: mark == 0
- * The bucket contains no data and will not be read
- *
- * - allocator bucket: owned_by_allocator == 1
- * The bucket is on a free list, or it is an open bucket
- *
- * - cached bucket: owned_by_allocator == 0 &&
- * dirty_sectors == 0 &&
- * cached_sectors > 0
- * The bucket contains data but may be safely discarded as there are
- * enough replicas of the data on other cache devices, or it has been
- * written back to the backing device
- *
- * - dirty bucket: owned_by_allocator == 0 &&
- * dirty_sectors > 0
- * The bucket contains data that we must not discard (either only copy,
- * or one of the 'main copies' for data requiring multiple replicas)
- *
- * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
- * This is a btree node, journal or gen/prio bucket
- *
- * Lifecycle:
- *
- * bucket invalidated => bucket on freelist => open bucket =>
- * [dirty bucket =>] cached bucket => bucket invalidated => ...
- *
- * Note that cache promotion can skip the dirty bucket step, as data
- * is copied from a deeper tier to a shallower tier, onto a cached
- * bucket.
- * Note also that a cached bucket can spontaneously become dirty --
- * see below.
- *
- * Only a traversal of the key space can determine whether a bucket is
- * truly dirty or cached.
- *
- * Transitions:
- *
- * - free => allocator: bucket was invalidated
- * - cached => allocator: bucket was invalidated
- *
- * - allocator => dirty: open bucket was filled up
- * - allocator => cached: open bucket was filled up
- * - allocator => metadata: metadata was allocated
- *
- * - dirty => cached: dirty sectors were copied to a deeper tier
- * - dirty => free: dirty sectors were overwritten or moved (copy gc)
- * - cached => free: cached sectors were overwritten
- *
- * - metadata => free: metadata was freed
- *
- * Oddities:
- * - cached => dirty: a device was removed so formerly replicated data
- * is no longer sufficiently replicated
- * - free => cached: cannot happen
- * - free => dirty: cannot happen
- * - free => metadata: cannot happen
- */
-
-#include "bcache.h"
-#include "alloc.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "error.h"
-
-#include <linux/preempt.h>
-#include <trace/events/bcache.h>
-
-#ifdef DEBUG_BUCKETS
-
-#define lg_local_lock lg_global_lock
-#define lg_local_unlock lg_global_unlock
-
-static void bch_fs_stats_verify(struct bch_fs *c)
-{
- struct bch_fs_usage stats =
- __bch_fs_usage_read(c);
-
- if ((s64) stats.sectors_dirty < 0)
- panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty);
-
- if ((s64) stats.sectors_cached < 0)
- panic("sectors_cached underflow: %lli\n", stats.sectors_cached);
-
- if ((s64) stats.sectors_meta < 0)
- panic("sectors_meta underflow: %lli\n", stats.sectors_meta);
-
- if ((s64) stats.sectors_persistent_reserved < 0)
- panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved);
-
- if ((s64) stats.sectors_online_reserved < 0)
- panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved);
-}
-
-#else
-
-static void bch_fs_stats_verify(struct bch_fs *c) {}
-
-#endif
-
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch_bucket_seq_cleanup(struct bch_fs *c)
-{
- u16 last_seq_ondisk = c->journal.last_seq_ondisk;
- struct bch_dev *ca;
- struct bucket *g;
- struct bucket_mark m;
- unsigned i;
-
- for_each_member_device(ca, c, i)
- for_each_bucket(g, ca) {
- bucket_cmpxchg(g, m, ({
- if (!m.journal_seq_valid ||
- bucket_needs_journal_commit(m, last_seq_ondisk))
- break;
-
- m.journal_seq_valid = 0;
- }));
- }
-}
-
-#define bch_usage_add(_acc, _stats) \
-do { \
- typeof(_acc) _a = (_acc), _s = (_stats); \
- unsigned i; \
- \
- for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
- ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
-} while (0)
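/*
 * Editorial note (not in the original file): bch_usage_add() relies on the
 * stats structs consisting only of u64 counters, so it can add them
 * element-wise as flat u64 arrays; bch_usage_read_raw() below uses it to sum
 * the per-cpu copies into a single accumulator.
 */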
-
-#define bch_usage_read_raw(_stats) \
-({ \
- typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
- int cpu; \
- \
- for_each_possible_cpu(cpu) \
- bch_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
- \
- _acc; \
-})
-
-#define bch_usage_read_cached(_c, _cached, _uncached) \
-({ \
- typeof(_cached) _ret; \
- unsigned _seq; \
- \
- do { \
- _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
- _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
- ? bch_usage_read_raw(_uncached) \
- : (_cached); \
- } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
- \
- _ret; \
-})
-
-struct bch_dev_usage __bch_dev_usage_read(struct bch_dev *ca)
-{
- return bch_usage_read_raw(ca->usage_percpu);
-}
-
-struct bch_dev_usage bch_dev_usage_read(struct bch_dev *ca)
-{
- return bch_usage_read_cached(ca->fs,
- ca->usage_cached,
- ca->usage_percpu);
-}
-
-struct bch_fs_usage
-__bch_fs_usage_read(struct bch_fs *c)
-{
- return bch_usage_read_raw(c->usage_percpu);
-}
-
-struct bch_fs_usage
-bch_fs_usage_read(struct bch_fs *c)
-{
- return bch_usage_read_cached(c,
- c->usage_cached,
- c->usage_percpu);
-}
-
-static inline int is_meta_bucket(struct bucket_mark m)
-{
- return m.data_type != BUCKET_DATA;
-}
-
-static inline int is_dirty_bucket(struct bucket_mark m)
-{
- return m.data_type == BUCKET_DATA && !!m.dirty_sectors;
-}
-
-static inline int is_cached_bucket(struct bucket_mark m)
-{
- return m.data_type == BUCKET_DATA &&
- !m.dirty_sectors && !!m.cached_sectors;
-}
-
-static inline enum s_alloc bucket_type(struct bucket_mark m)
-{
- return is_meta_bucket(m) ? S_META : S_DIRTY;
-}
-
-static bool bucket_became_unavailable(struct bch_fs *c,
- struct bucket_mark old,
- struct bucket_mark new)
-{
- return is_available_bucket(old) &&
- !is_available_bucket(new) &&
- c && c->gc_pos.phase == GC_PHASE_DONE;
-}
-
-void bch_fs_usage_apply(struct bch_fs *c,
- struct bch_fs_usage *stats,
- struct disk_reservation *disk_res,
- struct gc_pos gc_pos)
-{
- s64 added =
- stats->s[S_COMPRESSED][S_META] +
- stats->s[S_COMPRESSED][S_DIRTY] +
- stats->persistent_reserved +
- stats->online_reserved;
-
- /*
- * Not allowed to reduce sectors_available except by getting a
- * reservation:
- */
- BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
-
- if (added > 0) {
- disk_res->sectors -= added;
- stats->online_reserved -= added;
- }
-
- lg_local_lock(&c->usage_lock);
- /* online_reserved not subject to gc: */
- this_cpu_ptr(c->usage_percpu)->online_reserved +=
- stats->online_reserved;
- stats->online_reserved = 0;
-
- if (!gc_will_visit(c, gc_pos))
- bch_usage_add(this_cpu_ptr(c->usage_percpu), stats);
-
- bch_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
-
- memset(stats, 0, sizeof(*stats));
-}
-
-static void bch_fs_usage_update(struct bch_fs_usage *fs_usage,
- struct bucket_mark old, struct bucket_mark new)
-{
- fs_usage->s[S_COMPRESSED][S_CACHED] +=
- (int) new.cached_sectors - (int) old.cached_sectors;
- fs_usage->s[S_COMPRESSED][bucket_type(old)] -=
- old.dirty_sectors;
- fs_usage->s[S_COMPRESSED][bucket_type(new)] +=
- new.dirty_sectors;
-}
-
-static void bch_dev_usage_update(struct bch_dev *ca,
- struct bucket_mark old, struct bucket_mark new)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage *dev_usage;
-
- bch_fs_inconsistent_on(old.data_type && new.data_type &&
- old.data_type != new.data_type, c,
- "different types of metadata in same bucket: %u, %u",
- old.data_type, new.data_type);
-
- preempt_disable();
- dev_usage = this_cpu_ptr(ca->usage_percpu);
-
- dev_usage->sectors[S_CACHED] +=
- (int) new.cached_sectors - (int) old.cached_sectors;
-
- dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
- dev_usage->sectors[bucket_type(new)] += new.dirty_sectors;
-
- dev_usage->buckets_alloc +=
- (int) new.owned_by_allocator - (int) old.owned_by_allocator;
-
- dev_usage->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old);
- dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
- dev_usage->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old);
- preempt_enable();
-
- if (!is_available_bucket(old) && is_available_bucket(new))
- bch_wake_allocator(ca);
-}
-
-#define bucket_data_cmpxchg(ca, g, new, expr) \
-({ \
- struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
- \
- bch_dev_usage_update(ca, _old, new); \
- _old; \
-})
-
-void bch_invalidate_bucket(struct bch_dev *ca, struct bucket *g)
-{
- struct bch_fs_usage stats = { 0 };
- struct bucket_mark old, new;
-
- old = bucket_data_cmpxchg(ca, g, new, ({
- new.owned_by_allocator = 1;
- new.had_metadata = 0;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- new.copygc = 0;
- new.gen++;
- }));
-
- /* XXX: we're not actually updating fs usage's cached sectors... */
- bch_fs_usage_update(&stats, old, new);
-
- if (!old.owned_by_allocator && old.cached_sectors)
- trace_bcache_invalidate(ca, g - ca->buckets,
- old.cached_sectors);
-}
-
-void bch_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
-{
- struct bucket_mark old, new;
-
- old = bucket_data_cmpxchg(ca, g, new, ({
- new.owned_by_allocator = 0;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- }));
-
- BUG_ON(bucket_became_unavailable(ca->fs, old, new));
-}
-
-void bch_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
- bool owned_by_allocator)
-{
- struct bucket_mark new;
-
- bucket_data_cmpxchg(ca, g, new, ({
- new.owned_by_allocator = owned_by_allocator;
- }));
-}
-
-void bch_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
- enum bucket_data_type type,
- bool may_make_unavailable)
-{
- struct bucket_mark old, new;
-
- BUG_ON(!type);
-
- old = bucket_data_cmpxchg(ca, g, new, ({
- new.data_type = type;
- new.had_metadata = 1;
- }));
-
- BUG_ON(old.cached_sectors);
- BUG_ON(old.dirty_sectors);
- BUG_ON(!may_make_unavailable &&
- bucket_became_unavailable(ca->fs, old, new));
-}
-
-#define saturated_add(ca, dst, src, max) \
-do { \
- BUG_ON((int) (dst) + (src) < 0); \
- if ((dst) == (max)) \
- ; \
- else if ((dst) + (src) <= (max)) \
- dst += (src); \
- else { \
- dst = (max); \
- trace_bcache_sectors_saturated(ca); \
- } \
-} while (0)
-
-#if 0
-/* Reverting this until the copygc + compression issue is fixed: */
-
-static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
-{
- return crc_compression_type(crc)
- ? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc)
- : sectors;
-}
-
-static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
-{
- return crc_compression_type(crc)
- ? min_t(unsigned, crc_compressed_size(crc), sectors)
- : sectors;
-}
-#else
-static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
-{
- return sectors;
-}
-
-static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
-{
- return sectors;
-}
-#endif
-
-/*
- * Checking against gc's position has to be done here, inside the cmpxchg()
- * loop, to avoid racing with the start of gc clearing all the marks - GC does
- * that with the gc pos seqlock held.
- */
-static void bch_mark_pointer(struct bch_fs *c,
- struct bkey_s_c_extent e,
- const union bch_extent_crc *crc,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum s_alloc type,
- bool may_make_unavailable,
- struct bch_fs_usage *stats,
- bool gc_will_visit, u64 journal_seq)
-{
- struct bucket_mark old, new;
- unsigned saturated;
- struct bch_dev *ca = c->devs[ptr->dev];
- struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
- unsigned old_sectors, new_sectors;
- int disk_sectors, compressed_sectors;
-
- if (sectors > 0) {
- old_sectors = 0;
- new_sectors = sectors;
- } else {
- old_sectors = e.k->size;
- new_sectors = e.k->size + sectors;
- }
-
- disk_sectors = -__disk_sectors(crc, old_sectors)
- + __disk_sectors(crc, new_sectors);
- compressed_sectors = -__compressed_sectors(crc, old_sectors)
- + __compressed_sectors(crc, new_sectors);
-
- if (gc_will_visit) {
- if (journal_seq)
- bucket_cmpxchg(g, new, new.journal_seq = journal_seq);
-
- goto out;
- }
-
- old = bucket_data_cmpxchg(ca, g, new, ({
- saturated = 0;
-
- /*
- * Check this after reading bucket mark to guard against
- * the allocator invalidating a bucket after we've already
- * checked the gen
- */
- if (gen_after(new.gen, ptr->gen)) {
- EBUG_ON(type != S_CACHED &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
- return;
- }
-
- EBUG_ON(type != S_CACHED &&
- !may_make_unavailable &&
- is_available_bucket(new) &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
-
- if (type != S_CACHED &&
- new.dirty_sectors == GC_MAX_SECTORS_USED &&
- disk_sectors < 0)
- saturated = -disk_sectors;
-
- if (type == S_CACHED)
- saturated_add(ca, new.cached_sectors, disk_sectors,
- GC_MAX_SECTORS_USED);
- else
- saturated_add(ca, new.dirty_sectors, disk_sectors,
- GC_MAX_SECTORS_USED);
-
- if (!new.dirty_sectors &&
- !new.cached_sectors) {
- new.data_type = 0;
-
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
- } else {
- new.data_type = type == S_META
- ? BUCKET_BTREE : BUCKET_DATA;
- }
-
- new.had_metadata |= is_meta_bucket(new);
- }));
-
- BUG_ON(!may_make_unavailable &&
- bucket_became_unavailable(c, old, new));
-
- if (saturated &&
- atomic_long_add_return(saturated,
- &ca->saturated_count) >=
- ca->free_inc.size << ca->bucket_bits) {
- if (c->gc_thread) {
- trace_bcache_gc_sectors_saturated(c);
- wake_up_process(c->gc_thread);
- }
- }
-out:
- stats->s[S_COMPRESSED][type] += compressed_sectors;
- stats->s[S_UNCOMPRESSED][type] += sectors;
-}
-
-static void bch_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
- s64 sectors, bool metadata,
- bool may_make_unavailable,
- struct bch_fs_usage *stats,
- bool gc_will_visit, u64 journal_seq)
-{
- const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
- enum s_alloc type = metadata ? S_META : S_DIRTY;
-
- BUG_ON(metadata && bkey_extent_is_cached(e.k));
- BUG_ON(!sectors);
-
- extent_for_each_ptr_crc(e, ptr, crc)
- bch_mark_pointer(c, e, crc, ptr, sectors,
- ptr->cached ? S_CACHED : type,
- may_make_unavailable,
- stats, gc_will_visit, journal_seq);
-}
-
-static void __bch_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata,
- bool may_make_unavailable,
- struct bch_fs_usage *stats,
- bool gc_will_visit, u64 journal_seq)
-{
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- bch_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
- may_make_unavailable, stats,
- gc_will_visit, journal_seq);
- break;
- case BCH_RESERVATION: {
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- stats->persistent_reserved += r.v->nr_replicas * sectors;
- break;
- }
- }
-}
-
-void __bch_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata,
- struct bch_fs_usage *stats)
-{
- __bch_mark_key(c, k, sectors, metadata, true, stats, false, 0);
-}
-
-void bch_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata)
-{
- struct bch_fs_usage stats = { 0 };
-
- __bch_gc_mark_key(c, k, sectors, metadata, &stats);
-
- preempt_disable();
- bch_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
- preempt_enable();
-}
-
-void bch_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, bool metadata, struct gc_pos gc_pos,
- struct bch_fs_usage *stats, u64 journal_seq)
-{
- /*
- * synchronization w.r.t. GC:
- *
- * Normally, bucket sector counts/marks are updated on the fly, as
- * references are added/removed from the btree, the lists of buckets the
- * allocator owns, other metadata buckets, etc.
- *
- * When GC is in progress and going to mark this reference, we do _not_
- * mark this reference here, to avoid double counting - GC will count it
- * when it gets to it.
- *
- * To know whether we should mark a given reference (GC either isn't
- * running, or has already marked references at this position) we
- * construct a total order for everything GC walks. Then, we can simply
- * compare the position of the reference we're marking - @gc_pos - with
- * GC's current position. If GC is going to mark this reference, GC's
- * current position will be less than @gc_pos; if GC's current position
- * is greater than @gc_pos GC has either already walked this position,
- * or isn't running.
- *
- * To avoid racing with GC's position changing, we have to deal with
- * - GC's position being set to GC_POS_MIN when GC starts:
- * usage_lock guards against this
- * - GC's position overtaking @gc_pos: we guard against this with
- * whatever lock protects the data structure the reference lives in
- * (e.g. the btree node lock, or the relevant allocator lock).
- */
- lg_local_lock(&c->usage_lock);
- __bch_mark_key(c, k, sectors, metadata, false, stats,
- gc_will_visit(c, gc_pos), journal_seq);
-
- bch_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
-}
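
/*
 * Illustrative sketch, not part of the original file: the position check
 * described in the comment above boils down to something like this, read
 * under the gc_pos seqlock.  gc_pos_cmp() here stands in for whatever
 * total-order comparison the real gc_will_visit() (defined elsewhere in
 * this tree) uses:
 */
static inline bool sketch_gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
	unsigned seq;
	bool ret;

	do {
		seq = read_seqcount_begin(&c->gc_pos_lock);
		/* gc will mark @pos itself iff it hasn't walked past it yet: */
		ret = gc_pos_cmp(c->gc_pos, pos) < 0;
	} while (read_seqcount_retry(&c->gc_pos_lock, seq));

	return ret;
}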
-
-static u64 __recalc_sectors_available(struct bch_fs *c)
-{
- return c->capacity - bch_fs_sectors_used(c);
-}
-
-/* Used by gc when it's starting: */
-void bch_recalc_sectors_available(struct bch_fs *c)
-{
- int cpu;
-
- lg_global_lock(&c->usage_lock);
-
- for_each_possible_cpu(cpu)
- per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
-
- atomic64_set(&c->sectors_available,
- __recalc_sectors_available(c));
-
- lg_global_unlock(&c->usage_lock);
-}
-
-void bch_disk_reservation_put(struct bch_fs *c,
- struct disk_reservation *res)
-{
- if (res->sectors) {
- lg_local_lock(&c->usage_lock);
- this_cpu_sub(c->usage_percpu->online_reserved,
- res->sectors);
-
- bch_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
-
- res->sectors = 0;
- }
-}
-
-#define SECTORS_CACHE 1024
-
-int bch_disk_reservation_add(struct bch_fs *c,
- struct disk_reservation *res,
- unsigned sectors, int flags)
-{
- struct bch_fs_usage *stats;
- u64 old, new, v;
- s64 sectors_available;
- int ret;
-
- sectors *= res->nr_replicas;
-
- lg_local_lock(&c->usage_lock);
- stats = this_cpu_ptr(c->usage_percpu);
-
- if (sectors >= stats->available_cache)
- goto out;
-
- v = atomic64_read(&c->sectors_available);
- do {
- old = v;
- if (old < sectors) {
- lg_local_unlock(&c->usage_lock);
- goto recalculate;
- }
-
- new = max_t(s64, 0, old - sectors - SECTORS_CACHE);
- } while ((v = atomic64_cmpxchg(&c->sectors_available,
- old, new)) != old);
-
- stats->available_cache += old - new;
-out:
- stats->available_cache -= sectors;
- stats->online_reserved += sectors;
- res->sectors += sectors;
-
- bch_fs_stats_verify(c);
- lg_local_unlock(&c->usage_lock);
- return 0;
-
-recalculate:
- /*
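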
- * GC recalculates sectors_available when it starts, so that hopefully
- * we don't normally end up blocking here:
- */
-
- /*
-	 * Annoyingly, we can be called from extent_insert_fixup() with btree
-	 * locks held, hence the trylock on gc_lock below:
- */
-
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
- if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
- down_read(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock))
- return -EINTR;
- }
- lg_global_lock(&c->usage_lock);
-
- sectors_available = __recalc_sectors_available(c);
-
- if (sectors <= sectors_available ||
- (flags & BCH_DISK_RESERVATION_NOFAIL)) {
- atomic64_set(&c->sectors_available,
- max_t(s64, 0, sectors_available - sectors));
- stats->online_reserved += sectors;
- res->sectors += sectors;
- ret = 0;
- } else {
- atomic64_set(&c->sectors_available, sectors_available);
- ret = -ENOSPC;
- }
-
- bch_fs_stats_verify(c);
- lg_global_unlock(&c->usage_lock);
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
- up_read(&c->gc_lock);
-
- return ret;
-}
-
-int bch_disk_reservation_get(struct bch_fs *c,
- struct disk_reservation *res,
- unsigned sectors, int flags)
-{
- res->sectors = 0;
- res->gen = c->capacity_gen;
- res->nr_replicas = (flags & BCH_DISK_RESERVATION_METADATA)
- ? c->opts.metadata_replicas
- : c->opts.data_replicas;
-
- return bch_disk_reservation_add(c, res, sectors, flags);
-}
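
/*
 * Illustrative sketch, not part of the original file: the intended calling
 * convention for the reservation interface above.  A writer reserves space
 * up front, bch_fs_usage_apply() (above) transfers newly allocated sectors
 * out of the reservation as keys are inserted, and whatever is left over is
 * released:
 */
static int sketch_reserve_for_write(struct bch_fs *c, unsigned sectors)
{
	struct disk_reservation res;
	int ret;

	ret = bch_disk_reservation_get(c, &res, sectors, 0);
	if (ret)
		return ret;	/* -ENOSPC, unless BCH_DISK_RESERVATION_NOFAIL */

	/*
	 * ... perform the write; the insert path passes &res to
	 * bch_fs_usage_apply(), which decrements res.sectors as space is
	 * actually consumed ...
	 */

	bch_disk_reservation_put(c, &res);
	return 0;
}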
diff --git a/libbcache/buckets.h b/libbcache/buckets.h
deleted file mode 100644
index 81355576..00000000
--- a/libbcache/buckets.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Code for manipulating bucket marks for garbage collection.
- *
- * Copyright 2014 Datera, Inc.
- */
-
-#ifndef _BUCKETS_H
-#define _BUCKETS_H
-
-#include "buckets_types.h"
-#include "super.h"
-
-#define for_each_bucket(b, ca) \
- for (b = (ca)->buckets + (ca)->mi.first_bucket; \
- b < (ca)->buckets + (ca)->mi.nbuckets; b++)
-
-#define bucket_cmpxchg(g, new, expr) \
-({ \
- u64 _v = READ_ONCE((g)->_mark.counter); \
- struct bucket_mark _old; \
- \
- do { \
- (new).counter = _old.counter = _v; \
- expr; \
- } while ((_v = cmpxchg(&(g)->_mark.counter, \
- _old.counter, \
- (new).counter)) != _old.counter);\
- _old; \
-})
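
/*
 * Usage example (illustrative, not part of the original header): @expr
 * modifies the fields of @new it cares about, and the mark that was
 * actually replaced is returned, e.g.:
 *
 *	struct bucket_mark old, new;
 *
 *	old = bucket_cmpxchg(g, new, ({
 *		new.owned_by_allocator = 1;
 *	}));
 */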
-
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bch_dev *ca, struct bucket *g)
-{
- unsigned long r = g - ca->buckets;
- return g->mark.gen - ca->oldest_gens[r];
-}
-
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return sector_to_bucket(ca, ptr->offset);
-}
-
-/*
- * Returns 0 if no pointers or device offline - only for tracepoints!
- */
-static inline size_t PTR_BUCKET_NR_TRACE(const struct bch_fs *c,
- const struct bkey_i *k,
- unsigned ptr)
-{
- size_t bucket = 0;
-#if 0
- if (bkey_extent_is_data(&k->k)) {
- const struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(bkey_i_to_s_c_extent(k), ptr) {
- const struct bch_dev *ca = c->devs[ptr->dev];
- bucket = PTR_BUCKET_NR(ca, ptr);
- break;
- }
- }
-#endif
- return bucket;
-}
-
-static inline struct bucket *PTR_BUCKET(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return ca->buckets + PTR_BUCKET_NR(ca, ptr);
-}
-
-static inline u8 __gen_after(u8 a, u8 b)
-{
- u8 r = a - b;
-
- return r > 128U ? 0 : r;
-}
-
-static inline u8 gen_after(u8 a, u8 b)
-{
- u8 r = a - b;
-
- BUG_ON(r > 128U);
-
- return r;
-}
-
-/**
- * ptr_stale() - check if a pointer points into a bucket that has been
- * invalidated.
- */
-static inline u8 ptr_stale(const struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- return gen_after(PTR_BUCKET(ca, ptr)->mark.gen, ptr->gen);
-}
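
/*
 * Worked example (not part of the original header): generations are 8 bit
 * and wrap, so they're compared via the u8 difference.  If a bucket's gen
 * is 3 and a pointer into it carries gen 254, the bucket has been
 * invalidated since the pointer was created: 3 - 254 == 5 (mod 256), so
 * ptr_stale() returns 5.  A pointer with gen 3 gives 0, i.e. still valid.
 * gen_after() BUGs if the difference exceeds 128; the rest of the code is
 * responsible for garbage collecting before generations drift that far
 * apart (see bucket_gc_gen() above).
 */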
-
-/* bucket heaps */
-
-static inline bool bucket_min_cmp(struct bucket_heap_entry l,
- struct bucket_heap_entry r)
-{
- return l.val < r.val;
-}
-
-static inline bool bucket_max_cmp(struct bucket_heap_entry l,
- struct bucket_heap_entry r)
-{
- return l.val > r.val;
-}
-
-static inline void bucket_heap_push(struct bch_dev *ca, struct bucket *g,
- unsigned long val)
-{
- struct bucket_heap_entry new = { g, val };
-
- if (!heap_full(&ca->heap))
- heap_add(&ca->heap, new, bucket_min_cmp);
- else if (bucket_min_cmp(new, heap_peek(&ca->heap))) {
- ca->heap.data[0] = new;
- heap_sift(&ca->heap, 0, bucket_min_cmp);
- }
-}
-
-/* bucket gc marks */
-
-/*
- * The dirty and cached sector counts saturate.  If this occurs,
- * reference counting alone will not free the bucket, and a btree
- * GC must be performed.
- */
-#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
-
-static inline bool bucket_unused(struct bucket *g)
-{
- return !g->mark.counter;
-}
-
-static inline unsigned bucket_sectors_used(struct bucket *g)
-{
- return g->mark.dirty_sectors + g->mark.cached_sectors;
-}
-
-/* Per device stats: */
-
-struct bch_dev_usage __bch_dev_usage_read(struct bch_dev *);
-struct bch_dev_usage bch_dev_usage_read(struct bch_dev *);
-
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage stats)
-{
- return max_t(s64, 0,
- ca->mi.nbuckets - ca->mi.first_bucket -
- stats.buckets_dirty -
- stats.buckets_alloc -
- stats.buckets_meta);
-}
-
-/*
- * Number of reclaimable buckets - only for use by the allocator thread:
- */
-static inline u64 dev_buckets_available(struct bch_dev *ca)
-{
- return __dev_buckets_available(ca, bch_dev_usage_read(ca));
-}
-
-static inline u64 __dev_buckets_free(struct bch_dev *ca,
- struct bch_dev_usage stats)
-{
- return __dev_buckets_available(ca, stats) +
- fifo_used(&ca->free[RESERVE_NONE]) +
- fifo_used(&ca->free_inc);
-}
-
-static inline u64 dev_buckets_free(struct bch_dev *ca)
-{
- return __dev_buckets_free(ca, bch_dev_usage_read(ca));
-}
-
-/* Cache set stats: */
-
-struct bch_fs_usage __bch_fs_usage_read(struct bch_fs *);
-struct bch_fs_usage bch_fs_usage_read(struct bch_fs *);
-void bch_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, struct gc_pos);
-
-static inline u64 __bch_fs_sectors_used(struct bch_fs *c)
-{
- struct bch_fs_usage stats = __bch_fs_usage_read(c);
- u64 reserved = stats.persistent_reserved +
- stats.online_reserved;
-
- return stats.s[S_COMPRESSED][S_META] +
- stats.s[S_COMPRESSED][S_DIRTY] +
- reserved +
- (reserved >> 7);
-}
-
-static inline u64 bch_fs_sectors_used(struct bch_fs *c)
-{
- return min(c->capacity, __bch_fs_sectors_used(c));
-}
-
-/* XXX: kill? */
-static inline u64 sectors_available(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
- u64 ret = 0;
-
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i)
- ret += dev_buckets_available(ca) << ca->bucket_bits;
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool is_available_bucket(struct bucket_mark mark)
-{
- return (!mark.owned_by_allocator &&
- mark.data_type == BUCKET_DATA &&
- !mark.dirty_sectors &&
- !mark.nouse);
-}
-
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
- u16 last_seq_ondisk)
-{
- return m.journal_seq_valid &&
- ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
-void bch_bucket_seq_cleanup(struct bch_fs *);
-
-void bch_invalidate_bucket(struct bch_dev *, struct bucket *);
-void bch_mark_free_bucket(struct bch_dev *, struct bucket *);
-void bch_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
-void bch_mark_metadata_bucket(struct bch_dev *, struct bucket *,
- enum bucket_data_type, bool);
-
-void __bch_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
- struct bch_fs_usage *);
-void bch_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool);
-void bch_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
- struct gc_pos, struct bch_fs_usage *, u64);
-
-void bch_recalc_sectors_available(struct bch_fs *);
-
-void bch_disk_reservation_put(struct bch_fs *,
- struct disk_reservation *);
-
-#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
-#define BCH_DISK_RESERVATION_METADATA (1 << 1)
-#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 2)
-#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 3)
-
-int bch_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- unsigned, int);
-int bch_disk_reservation_get(struct bch_fs *,
- struct disk_reservation *,
- unsigned, int);
-
-#endif /* _BUCKETS_H */
diff --git a/libbcache/buckets_types.h b/libbcache/buckets_types.h
deleted file mode 100644
index ca187099..00000000
--- a/libbcache/buckets_types.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef _BUCKETS_TYPES_H
-#define _BUCKETS_TYPES_H
-
-enum bucket_data_type {
- BUCKET_DATA = 0,
- BUCKET_BTREE,
- BUCKET_PRIOS,
- BUCKET_JOURNAL,
- BUCKET_SB,
-};
-
-struct bucket_mark {
- union {
- struct {
- u64 counter;
- };
-
- struct {
- u8 gen;
-
- /* generation copygc is going to move this bucket into */
- unsigned copygc:1;
-
- unsigned journal_seq_valid:1;
-
- /*
- * If this bucket had metadata while at the current generation
- * number, the allocator must increment its gen before we reuse
- * it:
- */
- unsigned had_metadata:1;
-
- unsigned owned_by_allocator:1;
-
- unsigned data_type:3;
-
- unsigned nouse:1;
-
- u16 dirty_sectors;
- u16 cached_sectors;
-
- /*
- * low bits of journal sequence number when this bucket was most
- * recently modified: if journal_seq_valid is set, this bucket
- * can't be reused until the journal sequence number written to
- * disk is >= the bucket's journal sequence number:
- */
- u16 journal_seq;
- };
- };
-};
-
-struct bucket {
- union {
- struct {
- u16 read_prio;
- u16 write_prio;
- };
- u16 prio[2];
- };
-
- union {
- struct bucket_mark _mark;
- const struct bucket_mark mark;
- };
-};
-
-enum s_compressed {
- S_COMPRESSED,
- S_UNCOMPRESSED,
- S_COMPRESSED_NR,
-};
-
-enum s_alloc {
- S_META,
- S_DIRTY,
- S_CACHED,
- S_ALLOC_NR,
-};
-
-struct bch_dev_usage {
- u64 buckets_dirty;
- u64 buckets_cached;
- u64 buckets_meta;
- u64 buckets_alloc;
-
- u64 sectors[S_ALLOC_NR];
-};
-
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
- u64 s[S_COMPRESSED_NR][S_ALLOC_NR];
- u64 persistent_reserved;
- u64 online_reserved;
- u64 available_cache;
-};
-
-struct bucket_heap_entry {
- struct bucket *g;
- unsigned long val;
-};
-
-/*
- * A reservation for space on disk:
- */
-struct disk_reservation {
- u64 sectors;
- u32 gen;
- unsigned nr_replicas;
-};
-
-#endif /* _BUCKETS_TYPES_H */
diff --git a/libbcache/chardev.c b/libbcache/chardev.c
deleted file mode 100644
index da6d827f..00000000
--- a/libbcache/chardev.c
+++ /dev/null
@@ -1,407 +0,0 @@
-#include "bcache.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/major.h>
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/ioctl.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-#include <linux/bcache-ioctl.h>
-
-static long bch_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
-{
- struct bch_ioctl_assemble arg;
- const char *err;
- u64 *user_devs = NULL;
- char **devs = NULL;
- unsigned i;
- int ret = -EFAULT;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
-	user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
-	if (!user_devs)
-		return -ENOMEM;
-
-	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
-	if (!devs) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
- if (copy_from_user(user_devs, user_arg->devs,
- sizeof(u64) * arg.nr_devs))
- goto err;
-
- for (i = 0; i < arg.nr_devs; i++) {
- devs[i] = strndup_user((const char __user *)(unsigned long)
- user_devs[i],
- PATH_MAX);
-		if (IS_ERR(devs[i])) {
-			ret = PTR_ERR(devs[i]);
-			devs[i] = NULL;
-			goto err;
-		}
- }
-
- err = bch_fs_open(devs, arg.nr_devs, bch_opts_empty(), NULL);
- if (err) {
- pr_err("Could not open filesystem: %s", err);
- ret = -EINVAL;
- goto err;
- }
-
- ret = 0;
-err:
- if (devs)
- for (i = 0; i < arg.nr_devs; i++)
- kfree(devs[i]);
-	kfree(devs);
-	kfree(user_devs);
-	return ret;
-}
-
-static long bch_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
-{
- struct bch_ioctl_incremental arg;
- const char *err;
- char *path;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
- err = bch_fs_open_incremental(path);
- kfree(path);
-
- if (err) {
- pr_err("Could not register bcache devices: %s", err);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static long bch_global_ioctl(unsigned cmd, void __user *arg)
-{
- switch (cmd) {
- case BCH_IOCTL_ASSEMBLE:
- return bch_ioctl_assemble(arg);
- case BCH_IOCTL_INCREMENTAL:
- return bch_ioctl_incremental(arg);
- default:
- return -ENOTTY;
- }
-}
-
-static long bch_ioctl_query_uuid(struct bch_fs *c,
- struct bch_ioctl_query_uuid __user *user_arg)
-{
-	return copy_to_user(&user_arg->uuid,
-			    &c->sb.user_uuid,
-			    sizeof(c->sb.user_uuid))
-		? -EFAULT : 0;
-}
-
-static long bch_ioctl_start(struct bch_fs *c, struct bch_ioctl_start __user *user_arg)
-{
- struct bch_ioctl_start arg;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- return bch_fs_start(c) ? -EIO : 0;
-}
-
-static long bch_ioctl_stop(struct bch_fs *c)
-{
- bch_fs_stop(c);
- return 0;
-}
-
-/* returns with ref on ca->ref */
-static struct bch_dev *bch_device_lookup(struct bch_fs *c,
- const char __user *dev)
-{
- struct block_device *bdev;
- struct bch_dev *ca;
- char *path;
- unsigned i;
-
- path = strndup_user(dev, PATH_MAX);
-	if (IS_ERR(path))
-		return ERR_CAST(path);
-
- bdev = lookup_bdev(strim(path));
- kfree(path);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
-
- for_each_member_device(ca, c, i)
- if (ca->disk_sb.bdev == bdev)
- goto found;
-
-	ca = ERR_PTR(-ENOENT);
-found:
- bdput(bdev);
- return ca;
-}
-
-#if 0
-static struct bch_member *bch_uuid_lookup(struct bch_fs *c, uuid_le uuid)
-{
- struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
- unsigned i;
-
- lockdep_assert_held(&c->sb_lock);
-
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (!memcmp(&mi->members[i].uuid, &uuid, sizeof(uuid)))
- return &mi->members[i];
-
- return NULL;
-}
-#endif
-
-static long bch_ioctl_disk_add(struct bch_fs *c,
- struct bch_ioctl_disk __user *user_arg)
-{
- struct bch_ioctl_disk arg;
- char *path;
- int ret;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
- ret = bch_dev_add(c, path);
- kfree(path);
-
- return ret;
-}
-
-static long bch_ioctl_disk_remove(struct bch_fs *c,
- struct bch_ioctl_disk __user *user_arg)
-{
- struct bch_ioctl_disk arg;
- struct bch_dev *ca;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- return bch_dev_remove(c, ca, arg.flags);
-}
-
-static long bch_ioctl_disk_online(struct bch_fs *c,
- struct bch_ioctl_disk __user *user_arg)
-{
- struct bch_ioctl_disk arg;
- char *path;
- int ret;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.flags || arg.pad)
- return -EINVAL;
-
- path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
- ret = bch_dev_online(c, path);
- kfree(path);
- return ret;
-}
-
-static long bch_ioctl_disk_offline(struct bch_fs *c,
- struct bch_ioctl_disk __user *user_arg)
-{
- struct bch_ioctl_disk arg;
- struct bch_dev *ca;
- int ret;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- if (arg.pad)
- return -EINVAL;
-
- ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch_dev_offline(c, ca, arg.flags);
- percpu_ref_put(&ca->ref);
- return ret;
-}
-
-static long bch_ioctl_disk_set_state(struct bch_fs *c,
- struct bch_ioctl_disk_set_state __user *user_arg)
-{
- struct bch_ioctl_disk_set_state arg;
- struct bch_dev *ca;
- int ret;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch_dev_set_state(c, ca, arg.new_state, arg.flags);
-
- percpu_ref_put(&ca->ref);
- return ret;
-}
-
-static long bch_ioctl_disk_evacuate(struct bch_fs *c,
- struct bch_ioctl_disk __user *user_arg)
-{
- struct bch_ioctl_disk arg;
- struct bch_dev *ca;
- int ret;
-
- if (copy_from_user(&arg, user_arg, sizeof(arg)))
- return -EFAULT;
-
- ca = bch_device_lookup(c, (const char __user *)(unsigned long) arg.dev);
- if (IS_ERR(ca))
- return PTR_ERR(ca);
-
- ret = bch_dev_evacuate(c, ca);
-
- percpu_ref_put(&ca->ref);
- return ret;
-}
-
-long bch_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
-{
- /* ioctls that don't require admin cap: */
- switch (cmd) {
- case BCH_IOCTL_QUERY_UUID:
- return bch_ioctl_query_uuid(c, arg);
- }
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- /* ioctls that do require admin cap: */
- switch (cmd) {
- case BCH_IOCTL_START:
- return bch_ioctl_start(c, arg);
- case BCH_IOCTL_STOP:
- return bch_ioctl_stop(c);
-
- case BCH_IOCTL_DISK_ADD:
- return bch_ioctl_disk_add(c, arg);
- case BCH_IOCTL_DISK_REMOVE:
- return bch_ioctl_disk_remove(c, arg);
- case BCH_IOCTL_DISK_ONLINE:
- return bch_ioctl_disk_online(c, arg);
- case BCH_IOCTL_DISK_OFFLINE:
- return bch_ioctl_disk_offline(c, arg);
- case BCH_IOCTL_DISK_SET_STATE:
- return bch_ioctl_disk_set_state(c, arg);
- case BCH_IOCTL_DISK_EVACUATE:
- return bch_ioctl_disk_evacuate(c, arg);
-
- default:
- return -ENOTTY;
- }
-}
-
-static long bch_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
-{
- struct bch_fs *c = filp->private_data;
- void __user *arg = (void __user *) v;
-
- return c
- ? bch_fs_ioctl(c, cmd, arg)
- : bch_global_ioctl(cmd, arg);
-}
-
-static const struct file_operations bch_chardev_fops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = bch_chardev_ioctl,
- .open = nonseekable_open,
-};
-
-static int bch_chardev_major;
-static struct class *bch_chardev_class;
-static struct device *bch_chardev;
-static DEFINE_IDR(bch_chardev_minor);
-
-void bch_fs_chardev_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->chardev))
- device_unregister(c->chardev);
- if (c->minor >= 0)
- idr_remove(&bch_chardev_minor, c->minor);
-}
-
-int bch_fs_chardev_init(struct bch_fs *c)
-{
- c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
- if (c->minor < 0)
- return c->minor;
-
- c->chardev = device_create(bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, c->minor), NULL,
- "bcache%u-ctl", c->minor);
- if (IS_ERR(c->chardev))
- return PTR_ERR(c->chardev);
-
- return 0;
-}
-
-void bch_chardev_exit(void)
-{
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- device_destroy(bch_chardev_class,
- MKDEV(bch_chardev_major, 255));
- if (!IS_ERR_OR_NULL(bch_chardev_class))
- class_destroy(bch_chardev_class);
- if (bch_chardev_major > 0)
- unregister_chrdev(bch_chardev_major, "bcache");
-}
-
-int __init bch_chardev_init(void)
-{
- bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops);
- if (bch_chardev_major < 0)
- return bch_chardev_major;
-
- bch_chardev_class = class_create(THIS_MODULE, "bcache");
- if (IS_ERR(bch_chardev_class))
- return PTR_ERR(bch_chardev_class);
-
- bch_chardev = device_create(bch_chardev_class, NULL,
- MKDEV(bch_chardev_major, 255),
- NULL, "bcache-ctl");
- if (IS_ERR(bch_chardev))
- return PTR_ERR(bch_chardev);
-
- return 0;
-}
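
/*
 * Hypothetical userspace sketch, not part of this file: issuing
 * BCH_IOCTL_QUERY_UUID (the one ioctl above that needs no privileges)
 * against the per-filesystem control node created by bch_fs_chardev_init()
 * ("bcache%u-ctl").  The /dev path below assumes the usual udev layout, and
 * the uuid field is assumed to be the usual 16 byte uuid_le:
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/bcache-ioctl.h>

static int print_fs_uuid(const char *path)	/* e.g. "/dev/bcache0-ctl" */
{
	struct bch_ioctl_query_uuid qu;
	int fd = open(path, O_RDONLY), i;

	if (fd < 0)
		return -1;

	if (ioctl(fd, BCH_IOCTL_QUERY_UUID, &qu) < 0) {
		close(fd);
		return -1;
	}
	close(fd);

	for (i = 0; i < 16; i++)
		printf("%02x", qu.uuid.b[i]);
	printf("\n");
	return 0;
}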
diff --git a/libbcache/chardev.h b/libbcache/chardev.h
deleted file mode 100644
index 61a4c2b5..00000000
--- a/libbcache/chardev.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _BCACHE_CHARDEV_H
-#define _BCACHE_CHARDEV_H
-
-#ifndef NO_BCACHE_CHARDEV
-
-long bch_fs_ioctl(struct bch_fs *, unsigned, void __user *);
-
-void bch_fs_chardev_exit(struct bch_fs *);
-int bch_fs_chardev_init(struct bch_fs *);
-
-void bch_chardev_exit(void);
-int __init bch_chardev_init(void);
-
-#else
-
-static inline long bch_fs_ioctl(struct bch_fs *c,
- unsigned cmd, void __user * arg)
-{
- return -ENOSYS;
-}
-
-static inline void bch_fs_chardev_exit(struct bch_fs *c) {}
-static inline int bch_fs_chardev_init(struct bch_fs *c) { return 0; }
-
-static inline void bch_chardev_exit(void) {}
-static inline int __init bch_chardev_init(void) { return 0; }
-
-#endif
-
-#endif /* _BCACHE_CHARDEV_H */
diff --git a/libbcache/checksum.c b/libbcache/checksum.c
deleted file mode 100644
index b96050db..00000000
--- a/libbcache/checksum.c
+++ /dev/null
@@ -1,590 +0,0 @@
-
-#include "bcache.h"
-#include "checksum.h"
-#include "super.h"
-#include "super-io.h"
-
-#include <linux/crc32c.h>
-#include <linux/crypto.h>
-#include <linux/key.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/hash.h>
-#include <crypto/poly1305.h>
-#include <keys/user-type.h>
-
-/*
- * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
- * use permitted, subject to terms of PostgreSQL license; see.)
-
- * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
- * usual sort of implementation. (See Ross Williams' excellent introduction
- * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
- * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
- * If we have no working 64-bit type, then fake it with two 32-bit registers.
- *
- * The present implementation is a normal (not "reflected", in Williams'
- * terms) 64-bit CRC, using initial all-ones register contents and a final
- * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
- * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
- *
- * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x + 1
-*/
-
-static const u64 crc_table[256] = {
- 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
- 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
- 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
- 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
- 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
- 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
- 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
- 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
- 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
- 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
- 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
- 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
- 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
- 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
- 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
- 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
- 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
- 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
- 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
- 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
- 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
- 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
- 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
- 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
- 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
- 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
- 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
- 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
- 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
- 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
- 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
- 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
- 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
- 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
- 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
- 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
- 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
- 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
- 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
- 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
- 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
- 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
- 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
- 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
- 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
- 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
- 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
- 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
- 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
- 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
- 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
- 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
- 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
- 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
- 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
- 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
- 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
- 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
- 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
- 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
- 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
- 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
- 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
- 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
- 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
- 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
- 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
- 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
- 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
- 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
- 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
- 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
- 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
- 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
- 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
- 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
- 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
- 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
- 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
- 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
- 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
- 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
- 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
- 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
- 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
- 0x9AFCE626CE85B507ULL,
-};
-
-u64 bch_crc64_update(u64 crc, const void *_data, size_t len)
-{
- const unsigned char *data = _data;
-
- while (len--) {
- int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
- crc = crc_table[i] ^ (crc << 8);
- }
-
- return crc;
-}
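
/*
 * Illustrative example, not part of the original file: because
 * bch_crc64_update() is a plain table driven update function, a checksum
 * can be computed incrementally - feeding the buffer in chunks gives the
 * same result as a single call, as long as the caller applies the same
 * initial value and final inversion that bch_checksum() uses below for
 * BCH_CSUM_CRC64:
 */
static u64 sketch_crc64(const void *data, size_t len)
{
	return bch_crc64_update(U64_MAX, data, len) ^ U64_MAX;
}

static u64 sketch_crc64_chunked(const void *data, size_t len, size_t split)
{
	u64 crc = U64_MAX;

	crc = bch_crc64_update(crc, data, split);
	crc = bch_crc64_update(crc, data + split, len - split);
	return crc ^ U64_MAX;	/* equals sketch_crc64(data, len) */
}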
-
-static u64 bch_checksum_init(unsigned type)
-{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C:
- return U32_MAX;
- case BCH_CSUM_CRC64:
- return U64_MAX;
- default:
- BUG();
- }
-}
-
-static u64 bch_checksum_final(unsigned type, u64 crc)
-{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C:
- return crc ^ U32_MAX;
- case BCH_CSUM_CRC64:
- return crc ^ U64_MAX;
- default:
- BUG();
- }
-}
-
-static u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
-{
- switch (type) {
- case BCH_CSUM_NONE:
- return 0;
- case BCH_CSUM_CRC32C:
- return crc32c(crc, data, len);
- case BCH_CSUM_CRC64:
- return bch_crc64_update(crc, data, len);
- default:
- BUG();
- }
-}
-
-static inline void do_encrypt_sg(struct crypto_blkcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
-{
- struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d };
- int ret;
-
- ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
- BUG_ON(ret);
-}
-
-static inline void do_encrypt(struct crypto_blkcipher *tfm,
- struct nonce nonce,
- void *buf, size_t len)
-{
- struct scatterlist sg;
-
- sg_init_one(&sg, buf, len);
- do_encrypt_sg(tfm, nonce, &sg, len);
-}
-
-int bch_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
- void *buf, size_t len)
-{
- struct crypto_blkcipher *chacha20 =
- crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
- int ret;
-
-	if (IS_ERR(chacha20))
-		return PTR_ERR(chacha20);
-
- ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key));
- if (ret)
- goto err;
-
- do_encrypt(chacha20, nonce, buf, len);
-err:
- crypto_free_blkcipher(chacha20);
- return ret;
-}
-
-static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
-{
- u8 key[POLY1305_KEY_SIZE];
-
- nonce.d[3] ^= BCH_NONCE_POLY;
-
- memset(key, 0, sizeof(key));
- do_encrypt(c->chacha20, nonce, key, sizeof(key));
-
- desc->tfm = c->poly1305;
- desc->flags = 0;
- crypto_shash_init(desc);
- crypto_shash_update(desc, key, sizeof(key));
-}
-
-struct bch_csum bch_checksum(struct bch_fs *c, unsigned type,
- struct nonce nonce, const void *data, size_t len)
-{
- switch (type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64: {
- u64 crc = bch_checksum_init(type);
-
- crc = bch_checksum_update(type, crc, data, len);
- crc = bch_checksum_final(type, crc);
-
- return (struct bch_csum) { .lo = crc };
- }
-
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- gen_poly_key(c, desc, nonce);
-
- crypto_shash_update(desc, data, len);
- crypto_shash_final(desc, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- BUG();
- }
-}
-
-void bch_encrypt(struct bch_fs *c, unsigned type,
- struct nonce nonce, void *data, size_t len)
-{
- if (!bch_csum_type_is_encryption(type))
- return;
-
- do_encrypt(c->chacha20, nonce, data, len);
-}
-
-struct bch_csum bch_checksum_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- switch (type) {
- case BCH_CSUM_NONE:
- return (struct bch_csum) { 0 };
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64: {
- u64 crc = bch_checksum_init(type);
-
- bio_for_each_contig_segment(bv, bio, iter) {
- void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
- crc = bch_checksum_update(type,
- crc, p, bv.bv_len);
- kunmap_atomic(p);
- }
-
- crc = bch_checksum_final(type, crc);
- return (struct bch_csum) { .lo = crc };
- }
-
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
- u8 digest[POLY1305_DIGEST_SIZE];
- struct bch_csum ret = { 0 };
-
- gen_poly_key(c, desc, nonce);
-
- bio_for_each_contig_segment(bv, bio, iter) {
- void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
-
- crypto_shash_update(desc, p, bv.bv_len);
- kunmap_atomic(p);
- }
-
- crypto_shash_final(desc, digest);
-
- memcpy(&ret, digest, bch_crc_bytes[type]);
- return ret;
- }
- default:
- BUG();
- }
-}
-
-void bch_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- struct scatterlist sgl[16], *sg = sgl;
- size_t bytes = 0;
-
- if (!bch_csum_type_is_encryption(type))
- return;
-
- sg_init_table(sgl, ARRAY_SIZE(sgl));
-
- bio_for_each_contig_segment(bv, bio, iter) {
- if (sg == sgl + ARRAY_SIZE(sgl)) {
- sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
-
- le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE);
- bytes = 0;
-
- sg_init_table(sgl, ARRAY_SIZE(sgl));
- sg = sgl;
- }
-
- sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
- bytes += bv.bv_len;
-	}
-
- sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
-}
-
-#ifdef __KERNEL__
-int bch_request_key(struct bch_sb *sb, struct bch_key *key)
-{
- char key_description[60];
- struct key *keyring_key;
- const struct user_key_payload *ukp;
- int ret;
-
- snprintf(key_description, sizeof(key_description),
- "bcache:%pUb", &sb->user_uuid);
-
- keyring_key = request_key(&key_type_logon, key_description, NULL);
- if (IS_ERR(keyring_key))
- return PTR_ERR(keyring_key);
-
- down_read(&keyring_key->sem);
- ukp = user_key_payload(keyring_key);
- if (ukp->datalen == sizeof(*key)) {
- memcpy(key, ukp->data, ukp->datalen);
- ret = 0;
- } else {
- ret = -EINVAL;
- }
- up_read(&keyring_key->sem);
- key_put(keyring_key);
-
- return ret;
-}
-#else
-#include <keyutils.h>
-#include <uuid/uuid.h>
-
-int bch_request_key(struct bch_sb *sb, struct bch_key *key)
-{
- key_serial_t key_id;
- char key_description[60];
- char uuid[40];
-
- uuid_unparse_lower(sb->user_uuid.b, uuid);
- sprintf(key_description, "bcache:%s", uuid);
-
- key_id = request_key("user", key_description, NULL,
- KEY_SPEC_USER_KEYRING);
- if (key_id < 0)
- return -errno;
-
- if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
- return -1;
-
- return 0;
-}
-#endif
-
-static int bch_decrypt_sb_key(struct bch_fs *c,
- struct bch_sb_field_crypt *crypt,
- struct bch_key *key)
-{
- struct bch_encrypted_key sb_key = crypt->key;
- struct bch_key user_key;
- int ret = 0;
-
- /* is key encrypted? */
- if (!bch_key_is_encrypted(&sb_key))
- goto out;
-
- ret = bch_request_key(c->disk_sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key");
- goto err;
- }
-
- /* decrypt real key: */
- ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c),
- &sb_key, sizeof(sb_key));
- if (ret)
- goto err;
-
- if (bch_key_is_encrypted(&sb_key)) {
- bch_err(c, "incorrect encryption key");
- ret = -EINVAL;
- goto err;
- }
-out:
- *key = sb_key.key;
-err:
- memzero_explicit(&sb_key, sizeof(sb_key));
- memzero_explicit(&user_key, sizeof(user_key));
- return ret;
-}
-
-static int bch_alloc_ciphers(struct bch_fs *c)
-{
- if (!c->chacha20)
- c->chacha20 = crypto_alloc_blkcipher("chacha20", 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(c->chacha20))
- return PTR_ERR(c->chacha20);
-
- if (!c->poly1305)
- c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- if (IS_ERR(c->poly1305))
- return PTR_ERR(c->poly1305);
-
- return 0;
-}
-
-int bch_disable_encryption(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- crypt = bch_sb_get_crypt(c->disk_sb);
- if (!crypt)
- goto out;
-
- /* is key encrypted? */
- ret = 0;
- if (bch_key_is_encrypted(&crypt->key))
- goto out;
-
- ret = bch_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
-
- crypt->key.magic = BCH_KEY_MAGIC;
- crypt->key.key = key;
-
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0);
- bch_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-int bch_enable_encryption(struct bch_fs *c, bool keyed)
-{
- struct bch_encrypted_key key;
- struct bch_key user_key;
- struct bch_sb_field_crypt *crypt;
- int ret = -EINVAL;
-
- mutex_lock(&c->sb_lock);
-
- /* Do we already have an encryption key? */
- if (bch_sb_get_crypt(c->disk_sb))
- goto err;
-
- ret = bch_alloc_ciphers(c);
- if (ret)
- goto err;
-
- key.magic = BCH_KEY_MAGIC;
- get_random_bytes(&key.key, sizeof(key.key));
-
- if (keyed) {
- ret = bch_request_key(c->disk_sb, &user_key);
- if (ret) {
- bch_err(c, "error requesting encryption key");
- goto err;
- }
-
- ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c),
- &key, sizeof(key));
- if (ret)
- goto err;
- }
-
- ret = crypto_blkcipher_setkey(c->chacha20,
- (void *) &key.key, sizeof(key.key));
- if (ret)
- goto err;
-
- crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
- if (!crypt) {
- ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
- goto err;
- }
-
- crypt->key = key;
-
- /* write superblock */
- SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1);
- bch_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- memzero_explicit(&user_key, sizeof(user_key));
- memzero_explicit(&key, sizeof(key));
- return ret;
-}
-
-void bch_fs_encryption_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->poly1305))
- crypto_free_shash(c->poly1305);
- if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_blkcipher(c->chacha20);
-}
-
-int bch_fs_encryption_init(struct bch_fs *c)
-{
- struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret;
-
- crypt = bch_sb_get_crypt(c->disk_sb);
- if (!crypt)
- return 0;
-
- ret = bch_alloc_ciphers(c);
- if (ret)
- return ret;
-
- ret = bch_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto err;
-
- ret = crypto_blkcipher_setkey(c->chacha20,
- (void *) &key.key, sizeof(key.key));
-err:
- memzero_explicit(&key, sizeof(key));
- return ret;
-}
diff --git a/libbcache/checksum.h b/libbcache/checksum.h
deleted file mode 100644
index 10f62e5b..00000000
--- a/libbcache/checksum.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef _BCACHE_CHECKSUM_H
-#define _BCACHE_CHECKSUM_H
-
-#include "bcache.h"
-#include "super-io.h"
-
-#include <crypto/chacha20.h>
-
-u64 bch_crc64_update(u64, const void *, size_t);
-
-#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
-#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
-#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
-#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
-#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
-
-struct bch_csum bch_checksum(struct bch_fs *, unsigned, struct nonce,
- const void *, size_t);
-
-/*
- * This is used for various on disk data structures - bch_sb, prio_set, bset,
- * jset: The checksum is _always_ the first field of these structs
- */
-#define csum_vstruct(_c, _type, _nonce, _i) \
-({ \
- const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
- const void *end = vstruct_end(_i); \
- \
- bch_checksum(_c, _type, _nonce, start, end - start); \
-})
-
-int bch_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
-int bch_request_key(struct bch_sb *, struct bch_key *);
-
-void bch_encrypt(struct bch_fs *, unsigned, struct nonce,
- void *data, size_t);
-
-struct bch_csum bch_checksum_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-void bch_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
-
-int bch_disable_encryption(struct bch_fs *);
-int bch_enable_encryption(struct bch_fs *, bool);
-
-void bch_fs_encryption_exit(struct bch_fs *);
-int bch_fs_encryption_init(struct bch_fs *);
-
-static inline unsigned bch_data_checksum_type(struct bch_fs *c)
-{
- if (c->sb.encryption_type)
- return c->opts.wide_macs
- ? BCH_CSUM_CHACHA20_POLY1305_128
- : BCH_CSUM_CHACHA20_POLY1305_80;
-
- return c->opts.data_checksum;
-}
-
-static inline unsigned bch_meta_checksum_type(struct bch_fs *c)
-{
- return c->sb.encryption_type
- ? BCH_CSUM_CHACHA20_POLY1305_128
- : c->opts.metadata_checksum;
-}
-
-static inline bool bch_checksum_type_valid(const struct bch_fs *c,
- unsigned type)
-{
- if (type >= BCH_CSUM_NR)
- return false;
-
- if (bch_csum_type_is_encryption(type) && !c->chacha20)
- return false;
-
- return true;
-}
-
-static const unsigned bch_crc_bytes[] = {
- [BCH_CSUM_NONE] = 0,
- [BCH_CSUM_CRC32C] = 4,
- [BCH_CSUM_CRC64] = 8,
- [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
- [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
-};
-
-static inline bool bch_crc_cmp(struct bch_csum l, struct bch_csum r)
-{
- /*
- * XXX: need some way of preventing the compiler from optimizing this
- * into a form that isn't constant time..
- */
- return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
-}
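
/*
 * Illustrative sketch, not part of the original header: the intended verify
 * pattern for the structures csum_vstruct() is meant for.  @sb stands for
 * any of the vstructs listed above whose first field is its csum;
 * @csum_type and @nonce are whatever that structure type uses:
 */
static inline bool sketch_vstruct_csum_bad(struct bch_fs *c, unsigned csum_type,
					   struct nonce nonce, struct bch_sb *sb)
{
	struct bch_csum got = csum_vstruct(c, csum_type, nonce, sb);

	return bch_crc_cmp(got, sb->csum);	/* nonzero iff mismatch */
}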
-
-/* for skipping ahead and encrypting/decrypting at an offset: */
-static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
-{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
-
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
- return nonce;
-}
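
/*
 * Illustrative sketch, not part of the original header: since ChaCha20 is a
 * stream cipher, encrypt and decrypt are the same operation, and data can
 * be (de)crypted from any block aligned offset by advancing the nonce by
 * the same amount - e.g. to redo just the tail of a buffer:
 */
static inline void sketch_crypt_tail(struct bch_fs *c, unsigned csum_type,
				     struct nonce nonce, void *buf,
				     size_t offset, size_t len)
{
	/* offset must be a multiple of CHACHA20_BLOCK_SIZE (nonce_add asserts) */
	bch_encrypt(c, csum_type, nonce_add(nonce, offset), buf + offset, len);
}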
-
-static inline bool bch_key_is_encrypted(struct bch_encrypted_key *key)
-{
- return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
-}
-
-static inline struct nonce __bch_sb_key_nonce(struct bch_sb *sb)
-{
- __le64 magic = __bch_sb_magic(sb);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-static inline struct nonce bch_sb_key_nonce(struct bch_fs *c)
-{
- __le64 magic = bch_sb_magic(c);
-
- return (struct nonce) {{
- [0] = 0,
- [1] = 0,
- [2] = ((__le32 *) &magic)[0],
- [3] = ((__le32 *) &magic)[1],
- }};
-}
-
-#endif /* _BCACHE_CHECKSUM_H */
diff --git a/libbcache/clock.c b/libbcache/clock.c
deleted file mode 100644
index 85891a03..00000000
--- a/libbcache/clock.c
+++ /dev/null
@@ -1,161 +0,0 @@
-#include "bcache.h"
-#include "clock.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-
-static inline bool io_timer_cmp(struct io_timer *l, struct io_timer *r)
-{
- return time_after(l->expire, r->expire);
-}
-
-void bch_io_timer_add(struct io_clock *clock, struct io_timer *timer)
-{
- size_t i;
-
- spin_lock(&clock->timer_lock);
- for (i = 0; i < clock->timers.used; i++)
- if (clock->timers.data[i] == timer)
- goto out;
-
- BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
-out:
- spin_unlock(&clock->timer_lock);
-}
-
-void bch_io_timer_del(struct io_clock *clock, struct io_timer *timer)
-{
- size_t i;
-
- spin_lock(&clock->timer_lock);
-
- for (i = 0; i < clock->timers.used; i++)
- if (clock->timers.data[i] == timer) {
- heap_del(&clock->timers, i, io_timer_cmp);
- break;
- }
-
- spin_unlock(&clock->timer_lock);
-}
-
-struct io_clock_wait {
- struct io_timer timer;
- struct task_struct *task;
- int expired;
-};
-
-static void io_clock_wait_fn(struct io_timer *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
-void bch_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
-{
- struct io_clock_wait wait;
-
- /* XXX: calculate sleep time rigorously */
- wait.timer.expire = until;
- wait.timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
- bch_io_timer_add(clock, &wait.timer);
-
- schedule();
-
- bch_io_timer_del(clock, &wait.timer);
-}
-
-/*
- * _only_ to be used from a kthread
- */
-void bch_kthread_io_clock_wait(struct io_clock *clock,
- unsigned long until)
-{
- struct io_clock_wait wait;
-
- /* XXX: calculate sleep time rigorously */
- wait.timer.expire = until;
- wait.timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
- bch_io_timer_add(clock, &wait.timer);
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- break;
-
- if (wait.expired)
- break;
-
- schedule();
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- bch_io_timer_del(clock, &wait.timer);
-}
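
/*
 * Illustrative sketch, not part of the original file: the intended caller
 * is a kthread (a tiering/copygc style background thread, presumably) that
 * wants to be woken after a certain amount of IO has happened - i.e. in
 * IO-clock time rather than wall-clock time:
 */
static int sketch_io_clock_thread(void *arg)
{
	struct bch_fs *c = arg;

	while (!kthread_should_stop()) {
		unsigned long start = atomic_long_read(&c->io_clock[WRITE].now);

		/* ... do one unit of background work ... */

		/* sleep until roughly another 1024 sectors have been written: */
		bch_kthread_io_clock_wait(&c->io_clock[WRITE], start + 1024);
	}

	return 0;
}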
-
-static struct io_timer *get_expired_timer(struct io_clock *clock,
- unsigned long now)
-{
- struct io_timer *ret = NULL;
-
- spin_lock(&clock->timer_lock);
-
- if (clock->timers.used &&
- time_after_eq(now, clock->timers.data[0]->expire))
- heap_pop(&clock->timers, ret, io_timer_cmp);
-
- spin_unlock(&clock->timer_lock);
-
- return ret;
-}
-
-void bch_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
-{
- struct io_clock *clock = &c->io_clock[rw];
- struct io_timer *timer;
- unsigned long now;
-
-	/* Buffer up to IO_CLOCK_PCPU_SECTORS worth of IO in the percpu counter */
- preempt_disable();
-
- if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
- IO_CLOCK_PCPU_SECTORS)) {
- preempt_enable();
- return;
- }
-
- sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
- preempt_enable();
- now = atomic_long_add_return(sectors, &clock->now);
-
- while ((timer = get_expired_timer(clock, now)))
- timer->fn(timer);
-}
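
/*
 * Illustrative sketch, not part of the original file: arming a one shot
 * timer that fires after roughly @sectors more sectors of IO.  The callback
 * runs from whichever context next calls bch_increment_clock() and finds
 * the timer expired:
 */
static void sketch_timer_fn(struct io_timer *timer)
{
	pr_info("io clock timer fired\n");
}

static void sketch_arm_io_timer(struct bch_fs *c, struct io_timer *timer,
				unsigned sectors, int rw)
{
	struct io_clock *clock = &c->io_clock[rw];

	timer->fn = sketch_timer_fn;
	timer->expire = atomic_long_read(&clock->now) + sectors;
	bch_io_timer_add(clock, timer);
}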
-
-void bch_io_clock_exit(struct io_clock *clock)
-{
- free_heap(&clock->timers);
- free_percpu(clock->pcpu_buf);
-}
-
-int bch_io_clock_init(struct io_clock *clock)
-{
- atomic_long_set(&clock->now, 0);
- spin_lock_init(&clock->timer_lock);
-
- clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
- if (!clock->pcpu_buf)
- return -ENOMEM;
-
- if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
- return -ENOMEM;
-
- return 0;
-}
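
The per-cpu batching in bch_increment_clock() above is why these clocks are described as approximate: sectors accumulate in a per-cpu counter and are only folded into the shared clock once a CPU's buffer crosses IO_CLOCK_PCPU_SECTORS. A minimal standalone model of that behaviour (the toy_* names are invented; the real code uses percpu and atomic primitives):

/* Userspace model of the batching in bch_increment_clock(); illustrative only */
#include <stdio.h>

#define IO_CLOCK_PCPU_SECTORS 128

struct toy_clock {
	unsigned long	now;		/* stands in for atomic_long_t now */
	unsigned	pcpu_buf[4];	/* stands in for the percpu buffer */
};

static void toy_increment_clock(struct toy_clock *clock, int cpu, unsigned sectors)
{
	clock->pcpu_buf[cpu] += sectors;
	if (clock->pcpu_buf[cpu] < IO_CLOCK_PCPU_SECTORS)
		return;				/* still buffered locally */

	/* fold the buffered sectors into the shared clock, like this_cpu_xchg() */
	clock->now += clock->pcpu_buf[cpu];
	clock->pcpu_buf[cpu] = 0;
}

int main(void)
{
	struct toy_clock c = { 0 };
	unsigned i;

	for (i = 0; i < 100; i++)
		toy_increment_clock(&c, i % 4, 8);	/* 100 8-sector IOs */

	/* 800 sectors were issued, but the clock lags by what's still buffered */
	printf("clock = %lu sectors\n", c.now);
	return 0;
}
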
diff --git a/libbcache/clock.h b/libbcache/clock.h
deleted file mode 100644
index 9e081d7d..00000000
--- a/libbcache/clock.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _BCACHE_CLOCK_H
-#define _BCACHE_CLOCK_H
-
-void bch_io_timer_add(struct io_clock *, struct io_timer *);
-void bch_io_timer_del(struct io_clock *, struct io_timer *);
-void bch_kthread_io_clock_wait(struct io_clock *, unsigned long);
-void bch_increment_clock(struct bch_fs *, unsigned, int);
-
-void bch_io_clock_schedule_timeout(struct io_clock *, unsigned long);
-
-#define bch_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_timeout(wq, condition, timeout); \
- __ret; \
-})
-
-void bch_io_clock_exit(struct io_clock *);
-int bch_io_clock_init(struct io_clock *);
-
-#endif /* _BCACHE_CLOCK_H */
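
bch_kthread_io_clock_wait() is the piece that lets background work throttle itself against IO done rather than wall-clock time. A hedged sketch of a caller, assuming a kernel build with this tree's headers; toy_rebalance_thread and its 1024-sector budget are invented for illustration:

/* Assumes "bcache.h" and "clock.h" from this tree, plus <linux/kthread.h> */
static int toy_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct io_clock *clock = &c->io_clock[WRITE];

	while (!kthread_should_stop()) {
		unsigned long start = atomic_long_read(&clock->now);

		/* ... do one batch of background work here ... */

		/* sleep until another 1024 sectors of writes have happened */
		bch_kthread_io_clock_wait(clock, start + 1024);
	}
	return 0;
}
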
diff --git a/libbcache/clock_types.h b/libbcache/clock_types.h
deleted file mode 100644
index 4a02f467..00000000
--- a/libbcache/clock_types.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef _BCACHE_CLOCK_TYPES_H
-#define _BCACHE_CLOCK_TYPES_H
-
-#include "util.h"
-
-#define NR_IO_TIMERS 8
-
-/*
- * Clocks/timers in units of sectors of IO:
- *
- * Note - they use percpu batching, so they're only approximate.
- */
-
-struct io_timer;
-typedef void (*io_timer_fn)(struct io_timer *);
-
-struct io_timer {
- io_timer_fn fn;
- unsigned long expire;
-};
-
-/* Amount to buffer up on a percpu counter */
-#define IO_CLOCK_PCPU_SECTORS 128
-
-struct io_clock {
- atomic_long_t now;
- u16 __percpu *pcpu_buf;
-
- spinlock_t timer_lock;
- DECLARE_HEAP(struct io_timer *, timers);
-};
-
-#endif /* _BCACHE_CLOCK_TYPES_H */
-
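
struct io_timer carries only a callback and an expiry, so callers embed it in a larger structure and recover that structure inside the callback, as io_clock_wait_fn() does in clock.c with container_of(). A standalone illustration of that embedding pattern (the toy_* types are hypothetical):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct toy_timer {
	void		(*fn)(struct toy_timer *);
	unsigned long	expire;
};

struct toy_wait {
	struct toy_timer	timer;
	int			expired;
};

static void toy_wait_fn(struct toy_timer *timer)
{
	struct toy_wait *wait = container_of(timer, struct toy_wait, timer);

	wait->expired = 1;	/* the real callback also wakes the sleeping task */
}

int main(void)
{
	struct toy_wait wait = { .timer.fn = toy_wait_fn, .timer.expire = 100 };

	wait.timer.fn(&wait.timer);
	printf("expired = %d\n", wait.expired);
	return 0;
}
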
diff --git a/libbcache/closure.c b/libbcache/closure.c
deleted file mode 100644
index f6f4dd99..00000000
--- a/libbcache/closure.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Asynchronous refcounty things
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-
-#include "closure.h"
-
-static inline void closure_put_after_sub(struct closure *cl, int flags)
-{
- int r = flags & CLOSURE_REMAINING_MASK;
-
- BUG_ON(flags & CLOSURE_GUARD_MASK);
- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
-
- if (!r) {
- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
- atomic_set(&cl->remaining,
- CLOSURE_REMAINING_INITIALIZER);
- closure_queue(cl);
- } else {
- struct closure *parent = cl->parent;
- closure_fn *destructor = cl->fn;
-
- closure_debug_destroy(cl);
-
- if (destructor)
- destructor(cl);
-
- if (parent)
- closure_put(parent);
- }
- }
-}
-
-/* For clearing flags with the same atomic op as a put */
-void closure_sub(struct closure *cl, int v)
-{
- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
-}
-EXPORT_SYMBOL(closure_sub);
-
-/**
- * closure_put - decrement a closure's refcount
- */
-void closure_put(struct closure *cl)
-{
- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
-}
-EXPORT_SYMBOL(closure_put);
-
-/**
- * __closure_wake_up - wake up all closures on a wait list, without memory barrier
- */
-void __closure_wake_up(struct closure_waitlist *wait_list)
-{
- struct llist_node *list, *next;
- struct closure *cl;
-
- /*
- * Grab entire list, reverse order to preserve FIFO ordering, and wake
- * everything up
- */
- for (list = llist_reverse_order(llist_del_all(&wait_list->list));
- list;
- list = next) {
- next = llist_next(list);
- cl = container_of(list, struct closure, list);
-
- closure_set_waiting(cl, 0);
- closure_sub(cl, CLOSURE_WAITING + 1);
- }
-}
-EXPORT_SYMBOL(__closure_wake_up);
-
-/**
- * closure_wait - add a closure to a waitlist
- *
- * @waitlist will own a ref on @cl, which will be released when
- * closure_wake_up() is called on @waitlist.
- *
- */
-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
-{
- if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
- return false;
-
- closure_set_waiting(cl, _RET_IP_);
- atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
- llist_add(&cl->list, &waitlist->list);
-
- return true;
-}
-EXPORT_SYMBOL(closure_wait);
-
-struct closure_syncer {
- struct task_struct *task;
- int done;
-};
-
-static void closure_sync_fn(struct closure *cl)
-{
- cl->s->done = 1;
- wake_up_process(cl->s->task);
-}
-
-void __sched __closure_sync(struct closure *cl)
-{
- struct closure_syncer s = { .task = current };
-
- cl->s = &s;
- continue_at_noreturn(cl, closure_sync_fn, NULL);
-
- while (1) {
- __set_current_state(TASK_UNINTERRUPTIBLE);
- smp_mb();
- if (s.done)
- break;
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-}
-EXPORT_SYMBOL(__closure_sync);
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-static LIST_HEAD(closure_list);
-static DEFINE_SPINLOCK(closure_list_lock);
-
-void closure_debug_create(struct closure *cl)
-{
- unsigned long flags;
-
- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
- cl->magic = CLOSURE_MAGIC_ALIVE;
-
- spin_lock_irqsave(&closure_list_lock, flags);
- list_add(&cl->all, &closure_list);
- spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-EXPORT_SYMBOL(closure_debug_create);
-
-void closure_debug_destroy(struct closure *cl)
-{
- unsigned long flags;
-
- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
- cl->magic = CLOSURE_MAGIC_DEAD;
-
- spin_lock_irqsave(&closure_list_lock, flags);
- list_del(&cl->all);
- spin_unlock_irqrestore(&closure_list_lock, flags);
-}
-EXPORT_SYMBOL(closure_debug_destroy);
-
-static struct dentry *debug;
-
-static int debug_seq_show(struct seq_file *f, void *data)
-{
- struct closure *cl;
-
- spin_lock_irq(&closure_list_lock);
-
- list_for_each_entry(cl, &closure_list, all) {
- int r = atomic_read(&cl->remaining);
-
- seq_printf(f, "%p: %pF -> %pf p %p r %i ",
- cl, (void *) cl->ip, cl->fn, cl->parent,
- r & CLOSURE_REMAINING_MASK);
-
- seq_printf(f, "%s%s\n",
- test_bit(WORK_STRUCT_PENDING_BIT,
- work_data_bits(&cl->work)) ? "Q" : "",
- r & CLOSURE_RUNNING ? "R" : "");
-
- if (r & CLOSURE_WAITING)
- seq_printf(f, " W %pF\n",
- (void *) cl->waiting_on);
-
- seq_puts(f, "\n");
- }
-
- spin_unlock_irq(&closure_list_lock);
- return 0;
-}
-
-static int debug_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, debug_seq_show, NULL);
-}
-
-static const struct file_operations debug_ops = {
- .owner = THIS_MODULE,
- .open = debug_seq_open,
- .read = seq_read,
- .release = single_release
-};
-
-void __init closure_debug_init(void)
-{
- debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
-}
-
-#endif
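
closure_put_after_sub() works because the reference count and the state flags share a single atomic_t: bits below CLOSURE_BITS_START count outstanding references, and the bits above carry CLOSURE_RUNNING, CLOSURE_WAITING and CLOSURE_DESTRUCTOR as declared in closure.h below. A small standalone demonstration of that packing (plain arithmetic stands in for the atomic ops):

#include <stdio.h>

#define CLOSURE_BITS_START		(1U << 27)
#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
#define CLOSURE_RUNNING			(1U << 31)
#define CLOSURE_REMAINING_INITIALIZER	(1 | CLOSURE_RUNNING)

int main(void)
{
	unsigned remaining = CLOSURE_REMAINING_INITIALIZER;

	remaining += 2;		/* two closure_get()s, e.g. two bios in flight */
	remaining -= 1;		/* one closure_put() from an endio handler */

	printf("refs    = %u\n", remaining & CLOSURE_REMAINING_MASK);	/* 2 */
	printf("running = %s\n", remaining & CLOSURE_RUNNING ? "yes" : "no");
	return 0;
}
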
diff --git a/libbcache/closure.h b/libbcache/closure.h
deleted file mode 100644
index b55254b6..00000000
--- a/libbcache/closure.h
+++ /dev/null
@@ -1,387 +0,0 @@
-#ifndef _LINUX_CLOSURE_H
-#define _LINUX_CLOSURE_H
-
-#include <linux/llist.h>
-#include <linux/sched.h>
-#include <linux/workqueue.h>
-
-/*
- * Closure is perhaps the most overused and abused term in computer science, but
- * since I've been unable to come up with anything better you're stuck with it
- * again.
- *
- * What are closures?
- *
- * They embed a refcount. The basic idea is they count "things that are in
- * progress" - in flight bios, some other thread that's doing something else -
- * anything you might want to wait on.
- *
- * The refcount may be manipulated with closure_get() and closure_put().
- * closure_put() is where many of the interesting things happen, when it causes
- * the refcount to go to 0.
- *
- * Closures can be used to wait on things both synchronously and asynchronously,
- * and synchronous and asynchronous use can be mixed without restriction. To
- * wait synchronously, use closure_sync() - you will sleep until your closure's
- * refcount hits 1.
- *
- * To wait asynchronously, use
- * continue_at(cl, next_function, workqueue);
- *
- * passing it, as you might expect, the function to run when nothing is pending
- * and the workqueue to run that function out of.
- *
- * continue_at() also, critically, requires a 'return' immediately following the
- * location where this macro is referenced, to return to the calling function.
- * There's good reason for this.
- *
- * To safely use closures asynchronously, they must always have a refcount, owned
- * by the thread that is running them, while they are running. Otherwise, suppose
- * you submit some bios and wish to have a function run when they all complete:
- *
- * foo_endio(struct bio *bio)
- * {
- * closure_put(cl);
- * }
- *
- * closure_init(cl);
- *
- * do_stuff();
- * closure_get(cl);
- * bio1->bi_endio = foo_endio;
- * bio_submit(bio1);
- *
- * do_more_stuff();
- * closure_get(cl);
- * bio2->bi_endio = foo_endio;
- * bio_submit(bio2);
- *
- * continue_at(cl, complete_some_read, system_wq);
- *
- * If the closure's refcount started at 0, complete_some_read() could run before the
- * second bio was submitted - which is almost always not what you want! More
- * importantly, it wouldn't be possible to say whether the original thread or
- * complete_some_read()'s thread owned the closure - and whatever state it was
- * associated with!
- *
- * So, closure_init() initializes a closure's refcount to 1 - and when a
- * closure_fn is run, the refcount will be reset to 1 first.
- *
- * Then, the rule is - if you got the refcount with closure_get(), release it
- * with closure_put() (i.e., in a bio->bi_endio function). If you have a refcount
- * on a closure because you called closure_init() or you were run out of a
- * closure - _always_ use continue_at(). Doing so consistently will help
- * eliminate an entire class of particularly pernicious races.
- *
- * Lastly, you might have a wait list dedicated to a specific event, and have no
- * need for specifying the condition - you just want to wait until someone runs
- * closure_wake_up() on the appropriate wait list. In that case, just use
- * closure_wait(). It will return either true or false, depending on whether the
- * closure was already on a wait list or not - a closure can only be on one wait
- * list at a time.
- *
- * Parents:
- *
- * closure_init() takes two arguments - it takes the closure to initialize, and
- * a (possibly null) parent.
- *
- * If parent is non null, the new closure will hold a ref on its parent for its
- * lifetime; a closure is considered to be "finished" when its refcount hits 0
- * and the function to run is null. Hence
- *
- * continue_at(cl, NULL, NULL);
- *
- * returns up the (spaghetti) stack of closures, precisely like normal return
- * returns up the C stack. continue_at() with non null fn is better thought of
- * as doing a tail call.
- *
- * All this implies that a closure should typically be embedded in a particular
- * struct (which its refcount will normally control the lifetime of), and that
- * struct can very much be thought of as a stack frame.
- */
-
-struct closure;
-struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
-
-struct closure_waitlist {
- struct llist_head list;
-};
-
-enum closure_state {
- /*
- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
- * the thread that owns the closure, and cleared by the thread that's
- * waking up the closure.
- *
- * The rest are for debugging and don't affect behaviour:
- *
- * CLOSURE_RUNNING: Set when a closure is running (i.e. by
- * closure_init() and when closure_put() runs the next function), and
- * must be cleared before remaining hits 0. Primarily to help guard
- * against incorrect usage and accidentally transferring references.
- * continue_at() and closure_return() clear it for you, if you're doing
- * something unusual you can use closure_set_dead() which also helps
- * annotate where references are being transferred.
- */
-
- CLOSURE_BITS_START = (1U << 27),
- CLOSURE_DESTRUCTOR = (1U << 27),
- CLOSURE_WAITING = (1U << 29),
- CLOSURE_RUNNING = (1U << 31),
-};
-
-#define CLOSURE_GUARD_MASK \
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
-
-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
-
-struct closure {
- union {
- struct {
- struct workqueue_struct *wq;
- struct closure_syncer *s;
- struct llist_node list;
- closure_fn *fn;
- };
- struct work_struct work;
- };
-
- struct closure *parent;
-
- atomic_t remaining;
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-#define CLOSURE_MAGIC_DEAD 0xc054dead
-#define CLOSURE_MAGIC_ALIVE 0xc054a11e
-
- unsigned magic;
- struct list_head all;
- unsigned long ip;
- unsigned long waiting_on;
-#endif
-};
-
-void closure_sub(struct closure *cl, int v);
-void closure_put(struct closure *cl);
-void __closure_wake_up(struct closure_waitlist *list);
-bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void __closure_sync(struct closure *cl);
-
-/**
- * closure_sync - sleep until a closure has nothing left to wait on
- *
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
- * the last refcount.
- */
-static inline void closure_sync(struct closure *cl)
-{
- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
- __closure_sync(cl);
-}
-
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-
-void closure_debug_init(void);
-void closure_debug_create(struct closure *cl);
-void closure_debug_destroy(struct closure *cl);
-
-#else
-
-static inline void closure_debug_init(void) {}
-static inline void closure_debug_create(struct closure *cl) {}
-static inline void closure_debug_destroy(struct closure *cl) {}
-
-#endif
-
-static inline void closure_set_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->ip = _THIS_IP_;
-#endif
-}
-
-static inline void closure_set_ret_ip(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->ip = _RET_IP_;
-#endif
-}
-
-static inline void closure_set_waiting(struct closure *cl, unsigned long f)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- cl->waiting_on = f;
-#endif
-}
-
-static inline void closure_set_stopped(struct closure *cl)
-{
- atomic_sub(CLOSURE_RUNNING, &cl->remaining);
-}
-
-static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
- struct workqueue_struct *wq)
-{
- closure_set_ip(cl);
- cl->fn = fn;
- cl->wq = wq;
-	/* pairs with the atomic_dec() in closure_put() */
- smp_mb__before_atomic();
-}
-
-static inline void closure_queue(struct closure *cl)
-{
- struct workqueue_struct *wq = cl->wq;
-
- if (wq) {
- INIT_WORK(&cl->work, cl->work.func);
- queue_work(wq, &cl->work);
- } else
- cl->fn(cl);
-}
-
-/**
- * closure_get - increment a closure's refcount
- */
-static inline void closure_get(struct closure *cl)
-{
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
- BUG_ON((atomic_inc_return(&cl->remaining) &
- CLOSURE_REMAINING_MASK) <= 1);
-#else
- atomic_inc(&cl->remaining);
-#endif
-}
-
-/**
- * closure_init - Initialize a closure, setting the refcount to 1
- * @cl: closure to initialize
- * @parent: parent of the new closure. cl will take a refcount on it for its
- * lifetime; may be NULL.
- */
-static inline void closure_init(struct closure *cl, struct closure *parent)
-{
- cl->fn = NULL;
- cl->parent = parent;
- if (parent)
- closure_get(parent);
-
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-
- closure_debug_create(cl);
- closure_set_ip(cl);
-}
-
-static inline void closure_init_stack(struct closure *cl)
-{
- memset(cl, 0, sizeof(struct closure));
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-}
-
-/**
- * closure_wake_up - wake up all closures on a wait list.
- */
-static inline void closure_wake_up(struct closure_waitlist *list)
-{
- smp_mb();
- __closure_wake_up(list);
-}
-
-#define continue_at_noreturn(_cl, _fn, _wq) \
-do { \
- set_closure_fn(_cl, _fn, _wq); \
- closure_sub(_cl, CLOSURE_RUNNING + 1); \
-} while (0)
-
-/**
- * continue_at - jump to another function with barrier
- *
- * After @cl is no longer waiting on anything (i.e. all outstanding refs have
- * been dropped with closure_put()), it will resume execution at @fn running out
- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
- *
- * NOTE: This macro expands to a return in the calling function!
- *
- * This is because after calling continue_at() you no longer have a ref on @cl,
- * and whatever @cl owns may be freed out from under you - a running closure fn
- * has a ref on its own closure which continue_at() drops.
- */
-#define continue_at(_cl, _fn, _wq) \
-do { \
- continue_at_noreturn(_cl, _fn, _wq); \
- return; \
-} while (0)
-
-/**
- * closure_return - finish execution of a closure
- *
- * This is used to indicate that @cl is finished: when all outstanding refs on
- * @cl have been dropped @cl's ref on its parent closure (as passed to
- * closure_init()) will be dropped, if one was specified - thus this can be
- * thought of as returning to the parent closure.
- */
-#define closure_return(_cl) continue_at((_cl), NULL, NULL)
-
-/**
- * continue_at_nobarrier - jump to another function without barrier
- *
- * Causes @fn to be executed out of @cl, in @wq context (or called directly if
- * @wq is NULL).
- *
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
- * thus it's not safe to touch anything protected by @cl after a
- * continue_at_nobarrier().
- */
-#define continue_at_nobarrier(_cl, _fn, _wq) \
-do { \
-	closure_set_ip(_cl);						\
- if (_wq) { \
- INIT_WORK(&(_cl)->work, (void *) _fn); \
- queue_work((_wq), &(_cl)->work); \
- } else { \
- (_fn)(_cl); \
- } \
- return; \
-} while (0)
-
-#define closure_return_with_destructor_noreturn(_cl, _destructor) \
-do { \
- set_closure_fn(_cl, _destructor, NULL); \
- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
-} while (0)
-
-/**
- * closure_return_with_destructor - finish execution of a closure, with destructor
- *
- * Works like closure_return(), except @destructor will be called when all
- * outstanding refs on @cl have been dropped; @destructor may be used to safely
- * free the memory occupied by @cl, and it is called with the ref on the parent
- * closure still held - so @destructor could safely return an item to a
- * freelist protected by @cl's parent.
- */
-#define closure_return_with_destructor(_cl, _destructor) \
-do { \
- closure_return_with_destructor_noreturn(_cl, _destructor); \
- return; \
-} while (0)
-
-/**
- * closure_call - execute @fn out of a new, uninitialized closure
- *
- * Typically used when running out of one closure, and we want to run @fn
- * asynchronously out of a new closure - @parent will then wait for @cl to
- * finish.
- */
-static inline void closure_call(struct closure *cl, closure_fn fn,
- struct workqueue_struct *wq,
- struct closure *parent)
-{
- closure_init(cl, parent);
- continue_at_nobarrier(cl, fn, wq);
-}
-
-#endif /* _LINUX_CLOSURE_H */
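
As a concrete companion to the long comment at the top of this header, here is a hedged sketch of the pattern it describes, assuming a kernel build where this header and the block layer are available; struct foo_op, foo_write() and foo_write_done() are invented names, not libbcache functions:

#include <linux/bio.h>
#include "closure.h"

struct foo_op {
	struct closure	cl;
	struct bio	*bio;
};

static void foo_write_done(struct closure *cl)
{
	struct foo_op *op = container_of(cl, struct foo_op, cl);

	bio_endio(op->bio);
	closure_return(cl);		/* drops the ref on the parent, if any */
}

static void foo_endio(struct bio *bio)
{
	closure_put(bio->bi_private);	/* releases the ref taken before submission */
}

static void foo_write(struct foo_op *op, struct closure *parent)
{
	closure_init(&op->cl, parent);	/* refcount starts at 1, owned by us */

	closure_get(&op->cl);		/* ref owned by the in-flight bio */
	op->bio->bi_private = &op->cl;
	op->bio->bi_end_io  = foo_endio;
	submit_bio(op->bio);

	/* drops our ref and returns; foo_write_done runs once the bio completes */
	continue_at(&op->cl, foo_write_done, system_wq);
}
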
diff --git a/libbcache/compress.c b/libbcache/compress.c
deleted file mode 100644
index d9a64c38..00000000
--- a/libbcache/compress.c
+++ /dev/null
@@ -1,500 +0,0 @@
-#include "bcache.h"
-#include "compress.h"
-#include "extents.h"
-#include "io.h"
-#include "super-io.h"
-
-#include <linux/lz4.h>
-#include <linux/zlib.h>
-
-enum bounced {
- BOUNCED_CONTIG,
- BOUNCED_MAPPED,
- BOUNCED_KMALLOCED,
- BOUNCED_VMALLOCED,
- BOUNCED_MEMPOOLED,
-};
-
-static void *__bounce_alloc(struct bch_fs *c, unsigned size,
- unsigned *bounced, int direction)
-{
- void *data;
-
- *bounced = BOUNCED_KMALLOCED;
- data = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
- if (data)
- return data;
-
- *bounced = BOUNCED_MEMPOOLED;
- data = mempool_alloc(&c->compression_bounce[direction], GFP_NOWAIT);
- if (data)
- return page_address(data);
-
- *bounced = BOUNCED_VMALLOCED;
- data = vmalloc(size);
- if (data)
- return data;
-
- *bounced = BOUNCED_MEMPOOLED;
- data = mempool_alloc(&c->compression_bounce[direction], GFP_NOIO);
- return page_address(data);
-}
-
-static void *__bio_map_or_bounce(struct bch_fs *c,
- struct bio *bio, struct bvec_iter start,
- unsigned *bounced, int direction)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
- unsigned nr_pages = 0;
- struct page *stack_pages[16];
- struct page **pages = NULL;
- bool first = true;
- unsigned prev_end = PAGE_SIZE;
- void *data;
-
- BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX);
-
-#ifndef CONFIG_HIGHMEM
- *bounced = BOUNCED_CONTIG;
-
- __bio_for_each_contig_segment(bv, bio, iter, start) {
- if (bv.bv_len == start.bi_size)
- return page_address(bv.bv_page) + bv.bv_offset;
- }
-#endif
- *bounced = BOUNCED_MAPPED;
-
- __bio_for_each_segment(bv, bio, iter, start) {
- if ((!first && bv.bv_offset) ||
- prev_end != PAGE_SIZE)
- goto bounce;
-
- prev_end = bv.bv_offset + bv.bv_len;
- nr_pages++;
- }
-
- BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
-
- pages = nr_pages > ARRAY_SIZE(stack_pages)
- ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
- : stack_pages;
- if (!pages)
- goto bounce;
-
- nr_pages = 0;
- __bio_for_each_segment(bv, bio, iter, start)
- pages[nr_pages++] = bv.bv_page;
-
- data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (pages != stack_pages)
- kfree(pages);
-
- return data + bio_iter_offset(bio, start);
-bounce:
- data = __bounce_alloc(c, start.bi_size, bounced, direction);
-
- if (direction == READ)
- memcpy_from_bio(data, bio, start);
-
- return data;
-}
-
-static void *bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
- unsigned *bounced, int direction)
-{
- return __bio_map_or_bounce(c, bio, bio->bi_iter, bounced, direction);
-}
-
-static void bio_unmap_or_unbounce(struct bch_fs *c, void *data,
- unsigned bounced, int direction)
-{
- if (!data)
- return;
-
- switch (bounced) {
- case BOUNCED_MAPPED:
- vunmap((void *) ((unsigned long) data & PAGE_MASK));
- return;
- case BOUNCED_KMALLOCED:
- kfree(data);
- return;
- case BOUNCED_VMALLOCED:
- vfree(data);
- return;
- case BOUNCED_MEMPOOLED:
- mempool_free(virt_to_page(data), &c->compression_bounce[direction]);
- return;
- }
-}
-
-static inline void zlib_set_workspace(z_stream *strm, void *workspace)
-{
-#ifdef __KERNEL__
- strm->workspace = workspace;
-#endif
-}
-
-static int __bio_uncompress(struct bch_fs *c, struct bio *src,
- void *dst_data, struct bch_extent_crc128 crc)
-{
- void *src_data = NULL;
- unsigned src_bounced;
- size_t src_len = src->bi_iter.bi_size;
- size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
- int ret;
-
- src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
-
- switch (crc.compression_type) {
- case BCH_COMPRESSION_LZ4:
- ret = lz4_decompress(src_data, &src_len,
- dst_data, dst_len);
- if (ret) {
- ret = -EIO;
- goto err;
- }
- break;
- case BCH_COMPRESSION_GZIP: {
- void *workspace;
- z_stream strm;
-
- workspace = kmalloc(zlib_inflate_workspacesize(),
- GFP_NOIO|__GFP_NOWARN);
- if (!workspace) {
- mutex_lock(&c->zlib_workspace_lock);
- workspace = c->zlib_workspace;
- }
-
- strm.next_in = src_data;
- strm.avail_in = src_len;
- strm.next_out = dst_data;
- strm.avail_out = dst_len;
- zlib_set_workspace(&strm, workspace);
- zlib_inflateInit2(&strm, -MAX_WBITS);
-
- ret = zlib_inflate(&strm, Z_FINISH);
-
- if (workspace == c->zlib_workspace)
- mutex_unlock(&c->zlib_workspace_lock);
- else
- kfree(workspace);
-
- if (ret != Z_STREAM_END) {
- ret = -EIO;
- goto err;
- }
- break;
- }
- default:
- BUG();
- }
- ret = 0;
-err:
- bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
- return ret;
-}
-
-int bch_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
- unsigned live_data_sectors,
- struct bch_extent_crc128 crc)
-{
- void *dst_data = NULL;
- size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
- int ret = -ENOMEM;
-
- BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
-
- /* XXX mempoolify */
- dst_data = kmalloc(dst_len, GFP_NOIO|__GFP_NOWARN);
- if (!dst_data) {
- dst_data = vmalloc(dst_len);
- if (!dst_data)
- goto err;
- }
-
- ret = __bio_uncompress(c, bio, dst_data, crc);
- if (ret)
- goto err;
-
- while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
-
- bv->bv_page = alloc_page(GFP_NOIO);
- if (!bv->bv_page)
- goto use_mempool;
-
- bv->bv_len = PAGE_SIZE;
- bv->bv_offset = 0;
- bio->bi_vcnt++;
- }
-
- bio->bi_iter.bi_size = live_data_sectors << 9;
-copy_data:
- memcpy_to_bio(bio, bio->bi_iter, dst_data + (crc.offset << 9));
-err:
- kvfree(dst_data);
- return ret;
-use_mempool:
- /*
- * We already allocated from mempool, we can't allocate from it again
- * without freeing the pages we already allocated or else we could
- * deadlock:
- */
-
- bch_bio_free_pages_pool(c, bio);
- bch_bio_alloc_pages_pool(c, bio, live_data_sectors << 9);
- goto copy_data;
-}
-
-int bch_bio_uncompress(struct bch_fs *c, struct bio *src,
- struct bio *dst, struct bvec_iter dst_iter,
- struct bch_extent_crc128 crc)
-{
- void *dst_data = NULL;
- unsigned dst_bounced;
- size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
- int ret = -ENOMEM;
-
- dst_data = dst_len == dst_iter.bi_size
- ? __bio_map_or_bounce(c, dst, dst_iter, &dst_bounced, WRITE)
- : __bounce_alloc(c, dst_len, &dst_bounced, WRITE);
-
- ret = __bio_uncompress(c, src, dst_data, crc);
- if (ret)
- goto err;
-
- if (dst_bounced)
- memcpy_to_bio(dst, dst_iter, dst_data + (crc.offset << 9));
-err:
- bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
- return ret;
-}
-
-static int __bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- unsigned compression_type)
-{
- void *src_data = NULL, *dst_data = NULL;
- unsigned src_bounced, dst_bounced, pad;
- int ret = -1;
-
- dst_data = bio_map_or_bounce(c, dst, &dst_bounced, WRITE);
- src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
-
- switch (compression_type) {
- case BCH_COMPRESSION_LZ4: {
- void *workspace;
-
- *dst_len = dst->bi_iter.bi_size;
- *src_len = src->bi_iter.bi_size;
-
- workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-
- while (*src_len > block_bytes(c) &&
- (ret = lz4_compress(src_data, *src_len,
- dst_data, dst_len,
- workspace))) {
- /*
- * On error, the compressed data was bigger than
- * dst_len, and -ret is the amount of data we were able
- * to compress - round down to nearest block and try
- * again:
- */
- BUG_ON(ret > 0);
- BUG_ON(-ret >= *src_len);
-
- *src_len = round_down(-ret, block_bytes(c));
- }
-
- mempool_free(workspace, &c->lz4_workspace_pool);
-
- if (ret)
- goto err;
- break;
- }
- case BCH_COMPRESSION_GZIP: {
- void *workspace;
- z_stream strm;
-
- workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
- DEF_MEM_LEVEL),
- GFP_NOIO|__GFP_NOWARN);
- if (!workspace) {
- mutex_lock(&c->zlib_workspace_lock);
- workspace = c->zlib_workspace;
- }
-
- strm.next_in = src_data;
- strm.avail_in = min(src->bi_iter.bi_size,
- dst->bi_iter.bi_size);
- strm.next_out = dst_data;
- strm.avail_out = dst->bi_iter.bi_size;
- zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
- Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
-
- ret = zlib_deflate(&strm, Z_FINISH);
- if (ret != Z_STREAM_END) {
- ret = -EIO;
- goto zlib_err;
- }
-
- ret = zlib_deflateEnd(&strm);
- if (ret != Z_OK) {
- ret = -EIO;
- goto zlib_err;
- }
-
- ret = 0;
-zlib_err:
- if (workspace == c->zlib_workspace)
- mutex_unlock(&c->zlib_workspace_lock);
- else
- kfree(workspace);
-
- if (ret)
- goto err;
-
- *dst_len = strm.total_out;
- *src_len = strm.total_in;
- break;
- }
- default:
- BUG();
- }
-
- BUG_ON(!*dst_len);
- BUG_ON(*dst_len > dst->bi_iter.bi_size);
-
- BUG_ON(*src_len & (block_bytes(c) - 1));
- BUG_ON(*src_len > src->bi_iter.bi_size);
-
- /* Didn't get smaller: */
- if (round_up(*dst_len, block_bytes(c)) >= *src_len) {
- ret = -1;
- goto err;
- }
-
- pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
-
- memset(dst_data + *dst_len, 0, pad);
- *dst_len += pad;
-
- if (dst_bounced)
- memcpy_to_bio(dst, dst->bi_iter, dst_data);
-err:
- bio_unmap_or_unbounce(c, src_data, src_bounced, READ);
- bio_unmap_or_unbounce(c, dst_data, dst_bounced, WRITE);
- return ret;
-}
-
-void bch_bio_compress(struct bch_fs *c,
- struct bio *dst, size_t *dst_len,
- struct bio *src, size_t *src_len,
- unsigned *compression_type)
-{
- unsigned orig_dst = dst->bi_iter.bi_size;
- unsigned orig_src = src->bi_iter.bi_size;
-
- /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
- src->bi_iter.bi_size =
- min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9);
-
- /* Don't generate a bigger output than input: */
- dst->bi_iter.bi_size =
- min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-
- /* If it's only one block, don't bother trying to compress: */
- if (*compression_type != BCH_COMPRESSION_NONE &&
- bio_sectors(src) > c->sb.block_size &&
- !__bio_compress(c, dst, dst_len, src, src_len, *compression_type))
- goto out;
-
- /* If compressing failed (didn't get smaller), just copy: */
- *compression_type = BCH_COMPRESSION_NONE;
- *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- bio_copy_data(dst, src);
-out:
- dst->bi_iter.bi_size = orig_dst;
- src->bi_iter.bi_size = orig_src;
-}
-
-/* doesn't write superblock: */
-int bch_check_set_has_compressed_data(struct bch_fs *c,
- unsigned compression_type)
-{
- switch (compression_type) {
- case BCH_COMPRESSION_NONE:
- return 0;
- case BCH_COMPRESSION_LZ4:
- if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
- return 0;
-
- bch_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
- break;
- case BCH_COMPRESSION_GZIP:
- if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
-
- bch_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
- break;
- }
-
- return bch_fs_compress_init(c);
-}
-
-void bch_fs_compress_exit(struct bch_fs *c)
-{
- vfree(c->zlib_workspace);
- mempool_exit(&c->lz4_workspace_pool);
- mempool_exit(&c->compression_bounce[WRITE]);
- mempool_exit(&c->compression_bounce[READ]);
-}
-
-#define COMPRESSION_WORKSPACE_SIZE \
- max_t(size_t, zlib_inflate_workspacesize(), \
- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
-
-int bch_fs_compress_init(struct bch_fs *c)
-{
- unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
- int ret;
-
- if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
- !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
-
- if (!mempool_initialized(&c->compression_bounce[READ])) {
- ret = mempool_init_page_pool(&c->compression_bounce[READ],
- 1, order);
- if (ret)
- return ret;
- }
-
- if (!mempool_initialized(&c->compression_bounce[WRITE])) {
- ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
- 1, order);
- if (ret)
- return ret;
- }
-
- if (!mempool_initialized(&c->lz4_workspace_pool) &&
- bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
- ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
- 1, LZ4_MEM_COMPRESS);
- if (ret)
- return ret;
- }
-
- if (!c->zlib_workspace &&
- bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
- c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
- if (!c->zlib_workspace)
- return -ENOMEM;
- }
-
- return 0;
-}
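
The size bookkeeping at the end of __bio_compress() is worth spelling out: the bytes consumed from the source stay block-aligned, the compressed output is padded up to a block, and the result is only kept if it is still smaller than the input after padding. A standalone model of that arithmetic, assuming a 4 KiB block size for illustration:

#include <stdio.h>

#define BLOCK_BYTES		4096u
#define round_down(x, m)	((x) / (m) * (m))
#define round_up(x, m)		round_down((x) + (m) - 1, (m))

int main(void)
{
	unsigned src_len = 131072;	/* bytes consumed from the source bio */
	unsigned dst_len = 70000;	/* bytes the compressor produced */
	unsigned pad;

	/* Didn't get smaller once padded out? Then store uncompressed. */
	if (round_up(dst_len, BLOCK_BYTES) >= src_len) {
		printf("store uncompressed\n");
		return 0;
	}

	pad = round_up(dst_len, BLOCK_BYTES) - dst_len;
	printf("compressed: %u -> %u bytes (+%u padding)\n",
	       src_len, dst_len + pad, pad);
	return 0;
}
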
diff --git a/libbcache/compress.h b/libbcache/compress.h
deleted file mode 100644
index e8d208a0..00000000
--- a/libbcache/compress.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _BCACHE_COMPRESS_H
-#define _BCACHE_COMPRESS_H
-
-int bch_bio_uncompress_inplace(struct bch_fs *, struct bio *,
- unsigned, struct bch_extent_crc128);
-int bch_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
- struct bvec_iter, struct bch_extent_crc128);
-void bch_bio_compress(struct bch_fs *, struct bio *, size_t *,
- struct bio *, size_t *, unsigned *);
-
-int bch_check_set_has_compressed_data(struct bch_fs *, unsigned);
-void bch_fs_compress_exit(struct bch_fs *);
-int bch_fs_compress_init(struct bch_fs *);
-
-#endif /* _BCACHE_COMPRESS_H */
diff --git a/libbcache/debug.c b/libbcache/debug.c
deleted file mode 100644
index bddff979..00000000
--- a/libbcache/debug.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Assorted bcache debug code
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_cache.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "fs-gc.h"
-#include "inode.h"
-#include "io.h"
-#include "super.h"
-
-#include <linux/console.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-
-static struct dentry *bch_debug;
-
-#ifdef CONFIG_BCACHE_DEBUG
-
-static void btree_verify_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
-
- closure_put(cl);
-}
-
-void __bch_btree_verify(struct bch_fs *c, struct btree *b)
-{
- struct btree *v = c->verify_data;
- struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
- struct bset *sorted, *inmemory;
- struct extent_pick_ptr pick;
- struct bio *bio;
- struct closure cl;
-
- if (c->opts.nochanges)
- return;
-
- closure_init_stack(&cl);
-
- btree_node_io_lock(b);
- mutex_lock(&c->verify_lock);
-
- n_ondisk = c->verify_ondisk;
- n_sorted = c->verify_data->data;
- n_inmemory = b->data;
-
- bkey_copy(&v->key, &b->key);
- v->written = 0;
- v->level = b->level;
- v->btree_id = b->btree_id;
- bch_btree_keys_init(v, &c->expensive_debug_checks);
-
- pick = bch_btree_pick_ptr(c, b);
- if (IS_ERR_OR_NULL(pick.ca))
- return;
-
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
- bio->bi_bdev = pick.ca->disk_sb.bdev;
- bio->bi_iter.bi_sector = pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
- bio->bi_private = &cl;
- bio->bi_end_io = btree_verify_endio;
- bch_bio_map(bio, n_sorted);
-
- closure_get(&cl);
- bch_generic_make_request(bio, c);
- closure_sync(&cl);
-
- bio_put(bio);
-
- memcpy(n_ondisk, n_sorted, btree_bytes(c));
-
- bch_btree_node_read_done(c, v, pick.ca, &pick.ptr);
- n_sorted = c->verify_data->data;
-
- percpu_ref_put(&pick.ca->io_ref);
-
- sorted = &n_sorted->keys;
- inmemory = &n_inmemory->keys;
-
- if (inmemory->u64s != sorted->u64s ||
- memcmp(inmemory->start,
- sorted->start,
- vstruct_end(inmemory) - (void *) inmemory->start)) {
- unsigned offset = 0, sectors;
- struct bset *i;
- unsigned j;
-
- console_lock();
-
- printk(KERN_ERR "*** in memory:\n");
- bch_dump_bset(b, inmemory, 0);
-
- printk(KERN_ERR "*** read back in:\n");
- bch_dump_bset(v, sorted, 0);
-
- while (offset < b->written) {
-			if (!offset) {
- i = &n_ondisk->keys;
- sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
- c->block_bits;
- } else {
- struct btree_node_entry *bne =
- (void *) n_ondisk + (offset << 9);
- i = &bne->keys;
-
- sectors = vstruct_blocks(bne, c->block_bits) <<
- c->block_bits;
- }
-
- printk(KERN_ERR "*** on disk block %u:\n", offset);
- bch_dump_bset(b, i, offset);
-
- offset += sectors;
- }
-
- printk(KERN_ERR "*** block %u/%u not written\n",
- offset >> c->block_bits, btree_blocks(c));
-
- for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
- if (inmemory->_data[j] != sorted->_data[j])
- break;
-
- printk(KERN_ERR "b->written %u\n", b->written);
-
- console_unlock();
- panic("verify failed at %u\n", j);
- }
-
- mutex_unlock(&c->verify_lock);
- btree_node_io_unlock(b);
-}
-
-void bch_data_verify(struct cached_dev *dc, struct bio *bio)
-{
- char name[BDEVNAME_SIZE];
- struct bio *check;
- struct bio_vec bv;
- struct bvec_iter iter;
-
- check = bio_clone(bio, GFP_NOIO);
- if (!check)
- return;
- bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC);
-
- if (bio_alloc_pages(check, GFP_NOIO))
- goto out_put;
-
- submit_bio_wait(check);
-
- bio_for_each_segment(bv, bio, iter) {
- void *p1 = kmap_atomic(bv.bv_page);
- void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
-
- if (memcmp(p1 + bv.bv_offset,
- p2 + bv.bv_offset,
- bv.bv_len))
- panic("verify failed at dev %s sector %llu\n",
- bdevname(dc->disk_sb.bdev, name),
- (uint64_t) bio->bi_iter.bi_sector);
-
- kunmap_atomic(p1);
- }
-
- bio_free_pages(check);
-out_put:
- bio_put(check);
-}
-
-#endif
-
-#ifdef CONFIG_DEBUG_FS
-
-/* XXX: bch_fs refcounting */
-
-struct dump_iter {
- struct bpos from;
- struct bch_fs *c;
- enum btree_id id;
-
- char buf[PAGE_SIZE];
- size_t bytes; /* what's currently in buf */
-
- char __user *ubuf; /* destination user buffer */
- size_t size; /* size of requested read */
- ssize_t ret; /* bytes read so far */
-};
-
-static int flush_buf(struct dump_iter *i)
-{
- if (i->bytes) {
- size_t bytes = min(i->bytes, i->size);
- int err = copy_to_user(i->ubuf, i->buf, bytes);
-
- if (err)
- return err;
-
- i->ret += bytes;
- i->ubuf += bytes;
- i->size -= bytes;
- i->bytes -= bytes;
- memmove(i->buf, i->buf + bytes, i->bytes);
- }
-
- return 0;
-}
-
-static int bch_dump_open(struct inode *inode, struct file *file)
-{
- struct btree_debug *bd = inode->i_private;
- struct dump_iter *i;
-
- i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
- if (!i)
- return -ENOMEM;
-
- file->private_data = i;
- i->from = POS_MIN;
- i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
- i->id = bd->id;
-
- return 0;
-}
-
-static int bch_dump_release(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
- return 0;
-}
-
-static ssize_t bch_read_btree(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct btree_iter iter;
- struct bkey_s_c k;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- return i->ret;
-
- bch_btree_iter_init(&iter, i->c, i->id, i->from);
-
- while ((k = bch_btree_iter_peek(&iter)).k &&
- !(err = btree_iter_err(k))) {
- bch_bkey_val_to_text(i->c, bkey_type(0, i->id),
- i->buf, sizeof(i->buf), k);
- i->bytes = strlen(i->buf);
- BUG_ON(i->bytes >= PAGE_SIZE);
- i->buf[i->bytes] = '\n';
- i->bytes++;
-
- bch_btree_iter_advance_pos(&iter);
- i->from = iter.pos;
-
- err = flush_buf(i);
- if (err)
- break;
-
- if (!i->size)
- break;
- }
- bch_btree_iter_unlock(&iter);
-
- return err < 0 ? err : i->ret;
-}
-
-static const struct file_operations btree_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch_dump_open,
- .release = bch_dump_release,
- .read = bch_read_btree,
-};
-
-static ssize_t bch_read_btree_formats(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct btree_iter iter;
- struct btree *b;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size || !bkey_cmp(POS_MAX, i->from))
- return i->ret;
-
- for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
- i->bytes = bch_print_btree_node(i->c, b, i->buf,
- sizeof(i->buf));
- err = flush_buf(i);
- if (err)
- break;
-
- /*
- * can't easily correctly restart a btree node traversal across
- * all nodes, meh
- */
- i->from = bkey_cmp(POS_MAX, b->key.k.p)
- ? bkey_successor(b->key.k.p)
- : b->key.k.p;
-
- if (!i->size)
- break;
- }
- bch_btree_iter_unlock(&iter);
-
- return err < 0 ? err : i->ret;
-}
-
-static const struct file_operations btree_format_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch_dump_open,
- .release = bch_dump_release,
- .read = bch_read_btree_formats,
-};
-
-static ssize_t bch_read_bfloat_failed(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
-{
- struct dump_iter *i = file->private_data;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct btree *prev_node = NULL;
- int err;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- return i->ret;
-
- bch_btree_iter_init(&iter, i->c, i->id, i->from);
-
- while ((k = bch_btree_iter_peek(&iter)).k &&
- !(err = btree_iter_err(k))) {
- struct btree *b = iter.nodes[0];
- struct btree_node_iter *node_iter = &iter.node_iters[0];
- struct bkey_packed *_k = bch_btree_node_iter_peek(node_iter, b);
-
- if (iter.nodes[0] != prev_node) {
- i->bytes = bch_print_btree_node(i->c, b, i->buf,
- sizeof(i->buf));
- err = flush_buf(i);
- if (err)
- break;
- }
- prev_node = iter.nodes[0];
-
- i->bytes = bch_bkey_print_bfloat(b, _k, i->buf, sizeof(i->buf));
-
- err = flush_buf(i);
- if (err)
- break;
-
- bch_btree_iter_advance_pos(&iter);
- i->from = iter.pos;
-
- err = flush_buf(i);
- if (err)
- break;
-
- if (!i->size)
- break;
- }
- bch_btree_iter_unlock(&iter);
-
- return err < 0 ? err : i->ret;
-}
-
-static const struct file_operations bfloat_failed_debug_ops = {
- .owner = THIS_MODULE,
- .open = bch_dump_open,
- .release = bch_dump_release,
- .read = bch_read_bfloat_failed,
-};
-
-void bch_fs_debug_exit(struct bch_fs *c)
-{
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove_recursive(c->debug);
-}
-
-void bch_fs_debug_init(struct bch_fs *c)
-{
- struct btree_debug *bd;
- char name[100];
-
- if (IS_ERR_OR_NULL(bch_debug))
- return;
-
- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- c->debug = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->debug))
- return;
-
- for (bd = c->btree_debug;
- bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
- bd++) {
- bd->id = bd - c->btree_debug;
- bd->btree = debugfs_create_file(bch_btree_ids[bd->id],
- 0400, c->debug, bd,
- &btree_debug_ops);
-
- snprintf(name, sizeof(name), "%s-formats",
- bch_btree_ids[bd->id]);
-
- bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
- &btree_format_debug_ops);
-
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch_btree_ids[bd->id]);
-
- bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
- &bfloat_failed_debug_ops);
- }
-}
-
-#endif
-
-void bch_debug_exit(void)
-{
- if (!IS_ERR_OR_NULL(bch_debug))
- debugfs_remove_recursive(bch_debug);
-}
-
-int __init bch_debug_init(void)
-{
- int ret = 0;
-
- bch_debug = debugfs_create_dir("bcache", NULL);
- return ret;
-}
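
All of the debugfs read paths above funnel through flush_buf(), which drains the staging buffer into the reader's buffer and shifts any unread remainder to the front. A standalone userspace model of that helper (copy_to_user is replaced by memcpy; the toy_* names are invented):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

struct toy_iter {
	char	buf[64];
	size_t	bytes;		/* what's currently in buf */
	char	*ubuf;		/* destination "user" buffer */
	size_t	size;		/* size of requested read */
	ssize_t	ret;		/* bytes read so far */
};

static int toy_flush_buf(struct toy_iter *i)
{
	if (i->bytes) {
		size_t n = i->bytes < i->size ? i->bytes : i->size;

		memcpy(i->ubuf, i->buf, n);	/* stands in for copy_to_user */
		i->ret   += n;
		i->ubuf  += n;
		i->size  -= n;
		i->bytes -= n;
		memmove(i->buf, i->buf + n, i->bytes);
	}
	return 0;
}

int main(void)
{
	char out[16];
	struct toy_iter i = { .ubuf = out, .size = sizeof(out) };

	i.bytes = snprintf(i.buf, sizeof(i.buf), "0:0 -> 4096\n");
	toy_flush_buf(&i);
	printf("drained %zd bytes, %zu left buffered\n", i.ret, i.bytes);
	return 0;
}
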
diff --git a/libbcache/debug.h b/libbcache/debug.h
deleted file mode 100644
index 63e74304..00000000
--- a/libbcache/debug.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _BCACHE_DEBUG_H
-#define _BCACHE_DEBUG_H
-
-#include "bcache.h"
-
-struct bio;
-struct btree;
-struct cached_dev;
-struct bch_fs;
-
-#define BCH_DEBUG_PARAM(name, description) extern bool bch_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch_##name || c->name; }
-BCH_DEBUG_PARAMS_ALWAYS()
-#undef BCH_DEBUG_PARAM
-
-#ifdef CONFIG_BCACHE_DEBUG
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch_##name || c->name; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
-void __bch_btree_verify(struct bch_fs *, struct btree *);
-void bch_data_verify(struct cached_dev *, struct bio *);
-
-#define bypass_torture_test(d) ((d)->bypass_torture_test)
-
-#else /* DEBUG */
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) { return false; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
-static inline void __bch_btree_verify(struct bch_fs *c, struct btree *b) {}
-static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
-
-#define bypass_torture_test(d) 0
-
-#endif
-
-static inline void bch_btree_verify(struct bch_fs *c, struct btree *b)
-{
- if (verify_btree_ondisk(c))
- __bch_btree_verify(c, b);
-}
-
-#ifdef CONFIG_DEBUG_FS
-void bch_fs_debug_exit(struct bch_fs *);
-void bch_fs_debug_init(struct bch_fs *);
-#else
-static inline void bch_fs_debug_exit(struct bch_fs *c) {}
-static inline void bch_fs_debug_init(struct bch_fs *c) {}
-#endif
-
-void bch_debug_exit(void);
-int bch_debug_init(void);
-
-#endif
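
The BCH_DEBUG_PARAM lists above are an X-macro: a single list of (name, description) pairs is expanded several times under different definitions of the macro. A standalone sketch of the same technique, shown for just one expansion into bool definitions; the parameter names are invented:

#include <stdbool.h>
#include <stdio.h>

#define TOY_DEBUG_PARAMS()						\
	TOY_DEBUG_PARAM(verify_btree,     "verify btree on read")	\
	TOY_DEBUG_PARAM(expensive_checks, "extra runtime checks")

/* One expansion: turn each (name, description) pair into a bool definition */
#define TOY_DEBUG_PARAM(name, description) bool toy_##name;
TOY_DEBUG_PARAMS()
#undef TOY_DEBUG_PARAM

int main(void)
{
	toy_verify_btree = true;
	printf("verify_btree=%d expensive_checks=%d\n",
	       toy_verify_btree, toy_expensive_checks);
	return 0;
}
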
diff --git a/libbcache/dirent.c b/libbcache/dirent.c
deleted file mode 100644
index f961e881..00000000
--- a/libbcache/dirent.c
+++ /dev/null
@@ -1,427 +0,0 @@
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "dirent.h"
-#include "fs.h"
-#include "keylist.h"
-#include "str_hash.h"
-
-#include <linux/dcache.h>
-
-unsigned bch_dirent_name_bytes(struct bkey_s_c_dirent d)
-{
- unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent);
-
- while (len && !d.v->d_name[len - 1])
- --len;
-
- return len;
-}
-
-static u64 bch_dirent_hash(const struct bch_hash_info *info,
- const struct qstr *name)
-{
- struct bch_str_hash_ctx ctx;
-
- bch_str_hash_init(&ctx, info);
- bch_str_hash_update(&ctx, info, name->name, name->len);
-
- /* [0,2) reserved for dots */
- return max_t(u64, bch_str_hash_end(&ctx, info), 2);
-}
-
-static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch_dirent_hash(info, key);
-}
-
-static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = QSTR_INIT(d.v->d_name, bch_dirent_name_bytes(d));
-
- return bch_dirent_hash(info, &name);
-}
-
-static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- int len = bch_dirent_name_bytes(l);
- const struct qstr *r = _r;
-
- return len - r->len ?: memcmp(l.v->d_name, r->name, len);
-}
-
-static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- int l_len = bch_dirent_name_bytes(l);
- int r_len = bch_dirent_name_bytes(r);
-
- return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
-}
-
-static const struct bch_hash_desc dirent_hash_desc = {
- .btree_id = BTREE_ID_DIRENTS,
- .key_type = BCH_DIRENT,
- .whiteout_type = BCH_DIRENT_WHITEOUT,
- .hash_key = dirent_hash_key,
- .hash_bkey = dirent_hash_bkey,
- .cmp_key = dirent_cmp_key,
- .cmp_bkey = dirent_cmp_bkey,
-};
-
-static const char *bch_dirent_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
-{
- switch (k.k->type) {
- case BCH_DIRENT:
- return bkey_val_bytes(k.k) < sizeof(struct bch_dirent)
- ? "value too small"
- : NULL;
-
- case BCH_DIRENT_WHITEOUT:
- return bkey_val_bytes(k.k) != 0
- ? "value size should be zero"
- : NULL;
-
- default:
- return "invalid type";
- }
-}
-
-static void bch_dirent_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
-{
- struct bkey_s_c_dirent d;
-
- switch (k.k->type) {
- case BCH_DIRENT:
- d = bkey_s_c_to_dirent(k);
-
- if (size) {
- unsigned n = min_t(unsigned, size,
- bch_dirent_name_bytes(d));
- memcpy(buf, d.v->d_name, n);
- buf[size - 1] = '\0';
- buf += n;
- size -= n;
- }
-
- scnprintf(buf, size, " -> %llu", d.v->d_inum);
- break;
- case BCH_DIRENT_WHITEOUT:
- scnprintf(buf, size, "whiteout");
- break;
- }
-}
-
-const struct bkey_ops bch_bkey_dirent_ops = {
- .key_invalid = bch_dirent_invalid,
- .val_to_text = bch_dirent_to_text,
-};
-
-static struct bkey_i_dirent *dirent_create_key(u8 type,
- const struct qstr *name, u64 dst)
-{
- struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s +
- DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len,
- sizeof(u64));
-
- dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
- if (!dirent)
- return NULL;
-
- bkey_dirent_init(&dirent->k_i);
- dirent->k.u64s = u64s;
- dirent->v.d_inum = cpu_to_le64(dst);
- dirent->v.d_type = type;
-
- memcpy(dirent->v.d_name, name->name, name->len);
- memset(dirent->v.d_name + name->len, 0,
- bkey_val_bytes(&dirent->k) -
- (sizeof(struct bch_dirent) + name->len));
-
- EBUG_ON(bch_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
-
- return dirent;
-}
-
-int bch_dirent_create(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- u8 type, const struct qstr *name, u64 dst_inum,
- u64 *journal_seq, int flags)
-{
- struct bkey_i_dirent *dirent;
- int ret;
-
- dirent = dirent_create_key(type, name, dst_inum);
- if (!dirent)
- return -ENOMEM;
-
- ret = bch_hash_set(dirent_hash_desc, hash_info, c, dir_inum,
- journal_seq, &dirent->k_i, flags);
- kfree(dirent);
-
- return ret;
-}
-
-static void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
-static struct bpos bch_dirent_pos(struct bch_inode_info *ei,
- const struct qstr *name)
-{
- return POS(ei->vfs_inode.i_ino, bch_dirent_hash(&ei->str_hash, name));
-}
-
-int bch_dirent_rename(struct bch_fs *c,
- struct inode *src_dir, const struct qstr *src_name,
- struct inode *dst_dir, const struct qstr *dst_name,
- u64 *journal_seq, enum bch_rename_mode mode)
-{
- struct bch_inode_info *src_ei = to_bch_ei(src_dir);
- struct bch_inode_info *dst_ei = to_bch_ei(dst_dir);
- struct btree_iter src_iter, dst_iter, whiteout_iter;
- struct bkey_s_c old_src, old_dst;
- struct bkey delete;
- struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
- struct bpos src_pos = bch_dirent_pos(src_ei, src_name);
- struct bpos dst_pos = bch_dirent_pos(dst_ei, dst_name);
- bool need_whiteout;
- int ret = -ENOMEM;
-
- bch_btree_iter_init_intent(&src_iter, c, BTREE_ID_DIRENTS, src_pos);
- bch_btree_iter_init_intent(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos);
- bch_btree_iter_link(&src_iter, &dst_iter);
-
- bch_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos);
- bch_btree_iter_link(&src_iter, &whiteout_iter);
-
- if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(0, src_name, 0);
- if (!new_src)
- goto err;
- } else {
- new_src = (void *) &delete;
- }
-
- new_dst = dirent_create_key(0, dst_name, 0);
- if (!new_dst)
- goto err;
-retry:
- /*
- * Note that on -EINTR/dropped locks we're not restarting the lookup
- * from the original hashed position (like we do when creating dirents,
-	 * in bch_hash_set) - we never move existing dirents to a different slot:
- */
- old_src = bch_hash_lookup_at(dirent_hash_desc,
- &src_ei->str_hash,
- &src_iter, src_name);
- if ((ret = btree_iter_err(old_src)))
- goto err;
-
- ret = bch_hash_needs_whiteout(dirent_hash_desc,
- &src_ei->str_hash,
- &whiteout_iter, &src_iter);
- if (ret < 0)
- goto err;
- need_whiteout = ret;
-
- /*
- * Note that in BCH_RENAME mode, we're _not_ checking if
- * the target already exists - we're relying on the VFS
- * to do that check for us for correctness:
- */
- old_dst = mode == BCH_RENAME
- ? bch_hash_hole_at(dirent_hash_desc, &dst_iter)
- : bch_hash_lookup_at(dirent_hash_desc,
- &dst_ei->str_hash,
- &dst_iter, dst_name);
- if ((ret = btree_iter_err(old_dst)))
- goto err;
-
- switch (mode) {
- case BCH_RENAME:
- bkey_init(&new_src->k);
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-
- if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
- bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
- /*
- * If we couldn't insert new_dst at its hashed
- * position (dst_pos) due to a hash collision,
- * and we're going to be deleting in
- * between the hashed position and first empty
- * slot we found - just overwrite the pos we
- * were going to delete:
- *
- * Note: this is a correctness issue, in this
- * situation bch_hash_needs_whiteout() could
- * return false when the whiteout would have
- * been needed if we inserted at the pos
- * __dirent_find_hole() found
- */
- new_dst->k.p = src_iter.pos;
- ret = bch_btree_insert_at(c, NULL, NULL,
- journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&src_iter,
- &new_dst->k_i));
- goto err;
- }
-
- if (need_whiteout)
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- break;
- case BCH_RENAME_OVERWRITE:
- bkey_init(&new_src->k);
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-
- if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
- bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
- /*
- * Same case described above -
- * bch_hash_needs_whiteout could spuriously
- * return false, but we have to insert at
- * dst_iter.pos because we're overwriting
- * another dirent:
- */
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- } else if (need_whiteout)
- new_src->k.type = BCH_DIRENT_WHITEOUT;
- break;
- case BCH_RENAME_EXCHANGE:
- dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- break;
- }
-
- new_src->k.p = src_iter.pos;
- new_dst->k.p = dst_iter.pos;
- ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i),
- BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i));
-err:
- if (ret == -EINTR)
- goto retry;
-
- bch_btree_iter_unlock(&whiteout_iter);
- bch_btree_iter_unlock(&dst_iter);
- bch_btree_iter_unlock(&src_iter);
-
- if (new_src != (void *) &delete)
- kfree(new_src);
- kfree(new_dst);
- return ret;
-}
-
-int bch_dirent_delete(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name,
- u64 *journal_seq)
-{
- return bch_hash_delete(dirent_hash_desc, hash_info,
- c, dir_inum, journal_seq, name);
-}
-
-u64 bch_dirent_lookup(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 inum;
-
- k = bch_hash_lookup(dirent_hash_desc, hash_info, c,
- dir_inum, &iter, name);
- if (IS_ERR(k.k)) {
- bch_btree_iter_unlock(&iter);
- return 0;
- }
-
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
- bch_btree_iter_unlock(&iter);
-
- return inum;
-}
-
-int bch_empty_dir(struct bch_fs *c, u64 dir_inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), k) {
- if (k.k->p.inode > dir_inum)
- break;
-
- if (k.k->type == BCH_DIRENT) {
- ret = -ENOTEMPTY;
- break;
- }
- }
- bch_btree_iter_unlock(&iter);
-
- return ret;
-}
-
-int bch_readdir(struct bch_fs *c, struct file *file,
- struct dir_context *ctx)
-{
- struct inode *inode = file_inode(file);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent dirent;
- unsigned len;
-
- if (!dir_emit_dots(file, ctx))
- return 0;
-
- pr_debug("listing for %lu from %llu", inode->i_ino, ctx->pos);
-
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(inode->i_ino, ctx->pos), k) {
- if (k.k->type != BCH_DIRENT)
- continue;
-
- dirent = bkey_s_c_to_dirent(k);
-
- pr_debug("saw %llu:%llu (%s) -> %llu",
- k.k->p.inode, k.k->p.offset,
- dirent.v->d_name, dirent.v->d_inum);
-
- if (bkey_cmp(k.k->p, POS(inode->i_ino, ctx->pos)) < 0)
- continue;
-
- if (k.k->p.inode > inode->i_ino)
- break;
-
- len = bch_dirent_name_bytes(dirent);
-
- pr_debug("emitting %s", dirent.v->d_name);
-
- /*
- * XXX: dir_emit() can fault and block, while we're holding
- * locks
- */
- if (!dir_emit(ctx, dirent.v->d_name, len,
- le64_to_cpu(dirent.v->d_inum),
- dirent.v->d_type))
- break;
-
- ctx->pos = k.k->p.offset + 1;
- }
- bch_btree_iter_unlock(&iter);
-
- return 0;
-}
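
dirent_create_key() and bch_dirent_name_bytes() are two halves of one convention: the name is NUL-padded out to a whole number of u64s when the key is created, and the padding is trimmed back off when the length is read. A standalone model of that round trip; the 8-byte header is an assumed stand-in for the fixed fields of struct bch_dirent, not the real on-disk layout:

#include <stdio.h>
#include <string.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIRENT_HDR_BYTES	8	/* assumed size of the fields before d_name */

int main(void)
{
	const char *name = "lost+found";
	size_t name_len = strlen(name);

	/* Size the value in u64s, like dirent_create_key() */
	size_t u64s  = DIV_ROUND_UP(DIRENT_HDR_BYTES + name_len, 8);
	size_t bytes = u64s * 8;

	/* The name field is NUL-padded out to that size... */
	char d_name[64] = { 0 };
	memcpy(d_name, name, name_len);

	/* ...and bch_dirent_name_bytes() trims the padding back off: */
	size_t len = bytes - DIRENT_HDR_BYTES;
	while (len && !d_name[len - 1])
		--len;

	printf("value = %zu u64s, recovered name length = %zu\n", u64s, len);
	return 0;
}
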
diff --git a/libbcache/dirent.h b/libbcache/dirent.h
deleted file mode 100644
index 158d4cae..00000000
--- a/libbcache/dirent.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _BCACHE_DIRENT_H
-#define _BCACHE_DIRENT_H
-
-extern const struct bkey_ops bch_bkey_dirent_ops;
-
-struct qstr;
-struct file;
-struct dir_context;
-struct bch_fs;
-struct bch_hash_info;
-
-unsigned bch_dirent_name_bytes(struct bkey_s_c_dirent);
-int bch_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
- u8, const struct qstr *, u64, u64 *, int);
-int bch_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *, u64 *);
-
-enum bch_rename_mode {
- BCH_RENAME,
- BCH_RENAME_OVERWRITE,
- BCH_RENAME_EXCHANGE,
-};
-
-int bch_dirent_rename(struct bch_fs *,
- struct inode *, const struct qstr *,
- struct inode *, const struct qstr *,
- u64 *, enum bch_rename_mode);
-
-u64 bch_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *);
-
-int bch_empty_dir(struct bch_fs *, u64);
-int bch_readdir(struct bch_fs *, struct file *, struct dir_context *);
-
-#endif /* _BCACHE_DIRENT_H */
-
diff --git a/libbcache/error.c b/libbcache/error.c
deleted file mode 100644
index ba46d2d1..00000000
--- a/libbcache/error.c
+++ /dev/null
@@ -1,140 +0,0 @@
-#include "bcache.h"
-#include "error.h"
-#include "io.h"
-#include "notify.h"
-#include "super.h"
-
-void bch_inconsistent_error(struct bch_fs *c)
-{
- set_bit(BCH_FS_ERROR, &c->flags);
-
- switch (c->opts.errors) {
- case BCH_ON_ERROR_CONTINUE:
- break;
- case BCH_ON_ERROR_RO:
- if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
- /* XXX do something better here? */
- bch_fs_stop_async(c);
- return;
- }
-
- if (bch_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
- break;
- case BCH_ON_ERROR_PANIC:
- panic(bch_fmt(c, "panic after error"));
- break;
- }
-}
-
-void bch_fatal_error(struct bch_fs *c)
-{
- if (bch_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
-}
-
-/* Nonfatal IO errors, IO error/latency accounting: */
-
-/* Just does IO error accounting: */
-void bch_account_io_completion(struct bch_dev *ca)
-{
- /*
- * The halflife of an error is:
- * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
- */
-
- if (ca->fs->error_decay) {
- unsigned count = atomic_inc_return(&ca->io_count);
-
- while (count > ca->fs->error_decay) {
- unsigned errors;
- unsigned old = count;
- unsigned new = count - ca->fs->error_decay;
-
- /*
- * First we subtract refresh from count; each time we
-			 * successfully do so, we rescale the errors once:
- */
-
- count = atomic_cmpxchg(&ca->io_count, old, new);
-
- if (count == old) {
- count = new;
-
- errors = atomic_read(&ca->io_errors);
- do {
- old = errors;
- new = ((uint64_t) errors * 127) / 128;
- errors = atomic_cmpxchg(&ca->io_errors,
- old, new);
- } while (old != errors);
- }
- }
- }
-}
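As a sanity check on the halflife arithmetic in the comment above, the following standalone sketch (plain C, no kernel dependencies; rescale() and the starting count are illustrative, not bcache code) rescales an error count by 127/128 per decay step and counts the steps needed to halve it:

#include <stdio.h>
#include <stdint.h>

/* Rescale an error count by 127/128, as bch_account_io_completion() does
 * each time another error_decay's worth of IOs has completed. */
static uint64_t rescale(uint64_t errors)
{
	return (errors * 127) / 128;
}

int main(void)
{
	uint64_t initial_errors = 1 << 20;	/* arbitrary starting count */
	uint64_t errors = initial_errors;
	unsigned steps = 0;

	/* Count rescalings until the error count halves:
	 * log2(1/2) / log2(127/128) ~= 88; integer truncation may add a step. */
	while (errors > initial_errors / 2) {
		errors = rescale(errors);
		steps++;
	}

	printf("halved after %u rescalings\n", steps);
	return 0;
}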
-
-/* IO error accounting and latency accounting: */
-void bch_account_io_completion_time(struct bch_dev *ca,
- unsigned submit_time_us, int op)
-{
- struct bch_fs *c;
- unsigned threshold;
-
- if (!ca)
- return;
-
- c = ca->fs;
- threshold = op_is_write(op)
- ? c->congested_write_threshold_us
- : c->congested_read_threshold_us;
-
- if (threshold && submit_time_us) {
- unsigned t = local_clock_us();
-
- int us = t - submit_time_us;
- int congested = atomic_read(&c->congested);
-
- if (us > (int) threshold) {
- int ms = us / 1024;
- c->congested_last_us = t;
-
- ms = min(ms, CONGESTED_MAX + congested);
- atomic_sub(ms, &c->congested);
- } else if (congested < 0)
- atomic_inc(&c->congested);
- }
-
- bch_account_io_completion(ca);
-}
-
-void bch_nonfatal_io_error_work(struct work_struct *work)
-{
- struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
- struct bch_fs *c = ca->fs;
- unsigned errors = atomic_read(&ca->io_errors);
- bool dev;
-
- if (errors < c->error_limit) {
- bch_notify_dev_error(ca, false);
- } else {
- bch_notify_dev_error(ca, true);
-
- mutex_lock(&c->state_lock);
- dev = bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
- BCH_FORCE_IF_DEGRADED);
- if (dev
- ? __bch_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
- BCH_FORCE_IF_DEGRADED)
- : bch_fs_emergency_read_only(c))
- bch_err(ca,
- "too many IO errors, setting %s RO",
- dev ? "device" : "filesystem");
- mutex_unlock(&c->state_lock);
- }
-}
-
-void bch_nonfatal_io_error(struct bch_dev *ca)
-{
- atomic_add(1 << IO_ERROR_SHIFT, &ca->io_errors);
- queue_work(system_long_wq, &ca->io_error_work);
-}
diff --git a/libbcache/error.h b/libbcache/error.h
deleted file mode 100644
index 726b20d4..00000000
--- a/libbcache/error.h
+++ /dev/null
@@ -1,240 +0,0 @@
-#ifndef _BCACHE_ERROR_H
-#define _BCACHE_ERROR_H
-
-#include <linux/printk.h>
-
-struct bch_dev;
-struct bch_fs;
-
-/*
- * XXX: separate out errors that indicate on disk data is inconsistent, and flag
- * superblock as such
- */
-
-/* Error messages: */
-
-/*
- * Very fatal logic/inconsistency errors: these indicate that we've majorly
- * screwed up at runtime, i.e. it's not likely that it was just caused by the
- * data on disk being inconsistent. These BUG():
- *
- * XXX: audit and convert to inconsistent() checks
- */
-
-#define bch_fs_bug(c, ...) \
-do { \
- bch_err(c, __VA_ARGS__); \
- BUG(); \
-} while (0)
-
-#define bch_fs_bug_on(cond, c, ...) \
-do { \
- if (cond) \
- bch_fs_bug(c, __VA_ARGS__); \
-} while (0)
-
-/*
- * Inconsistency errors: The on disk data is inconsistent. If these occur during
- * initial recovery, they don't indicate a bug in the running code - we walk all
- * the metadata before modifying anything. If they occur at runtime, they
- * indicate either a bug in the running code or (less likely) data is being
- * silently corrupted under us.
- *
- * XXX: audit all inconsistent errors and make sure they're all recoverable, in
- * BCH_ON_ERROR_CONTINUE mode
- */
-
-void bch_inconsistent_error(struct bch_fs *);
-
-#define bch_fs_inconsistent(c, ...) \
-do { \
- bch_err(c, __VA_ARGS__); \
- bch_inconsistent_error(c); \
-} while (0)
-
-#define bch_fs_inconsistent_on(cond, c, ...) \
-({ \
- int _ret = !!(cond); \
- \
- if (_ret) \
- bch_fs_inconsistent(c, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * Later we might want to mark only the particular device inconsistent, not the
- * entire filesystem:
- */
-
-#define bch_dev_inconsistent(ca, ...) \
-do { \
- bch_err(ca, __VA_ARGS__); \
- bch_inconsistent_error((ca)->fs); \
-} while (0)
-
-#define bch_dev_inconsistent_on(cond, ca, ...) \
-({ \
- int _ret = !!(cond); \
- \
- if (_ret) \
- bch_dev_inconsistent(ca, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * Fsck errors: inconsistency errors we detect at mount time, and should ideally
- * be able to repair:
- */
-
-enum {
- BCH_FSCK_OK = 0,
- BCH_FSCK_ERRORS_NOT_FIXED = 1,
- BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
- BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
- BCH_FSCK_UNKNOWN_VERSION = 4,
-};
-
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
-#ifndef __fsck_err
-#define __fsck_err(c, _can_fix, _can_ignore, _nofix_msg, msg, ...) \
-({ \
- bool _fix = false; \
- \
- if (_can_fix && (c)->opts.fix_errors) { \
- bch_err(c, msg ", fixing", ##__VA_ARGS__); \
- set_bit(BCH_FS_FSCK_FIXED_ERRORS, &(c)->flags); \
- _fix = true; \
- } else if (_can_ignore && \
- (c)->opts.errors == BCH_ON_ERROR_CONTINUE) { \
- bch_err(c, msg " (ignoring)", ##__VA_ARGS__); \
- } else { \
- bch_err(c, msg " ("_nofix_msg")", ##__VA_ARGS__); \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- } \
- \
- BUG_ON(!_fix && !_can_ignore); \
- _fix; \
-})
-#endif
-
-#define __fsck_err_on(cond, c, _can_fix, _can_ignore, _nofix_msg, ...) \
- ((cond) ? __fsck_err(c, _can_fix, _can_ignore, \
- _nofix_msg, ##__VA_ARGS__) : false)
-
-#define unfixable_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, false, true, "repair unimplemented", ##__VA_ARGS__)
-
-#define need_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, false, true, "run fsck to correct", ##__VA_ARGS__)
-
-#define mustfix_fsck_err(c, ...) \
- __fsck_err(c, true, false, "not fixing", ##__VA_ARGS__)
-
-#define mustfix_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, true, false, "not fixing", ##__VA_ARGS__)
-
-#define fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, true, true, "not fixing", ##__VA_ARGS__)
-
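The three-way decision encoded in __fsck_err() (fix, ignore, or bail out with BCH_FSCK_ERRORS_NOT_FIXED) can be sketched on its own. The snippet below is a standalone illustration, not part of the bcache headers; fsck_decide() and its parameters are made-up names mirroring _can_fix/_can_ignore and the fix_errors / errors=continue options.

#include <stdbool.h>
#include <stdio.h>

enum fsck_action { FSCK_FIX, FSCK_IGNORE, FSCK_ABORT };

/*
 * Mirror of the branch structure in __fsck_err(): opt_fix corresponds to
 * (c)->opts.fix_errors, errors_continue to opts.errors == BCH_ON_ERROR_CONTINUE.
 */
static enum fsck_action fsck_decide(bool can_fix, bool can_ignore,
				    bool opt_fix, bool errors_continue)
{
	if (can_fix && opt_fix)
		return FSCK_FIX;
	if (can_ignore && errors_continue)
		return FSCK_IGNORE;
	return FSCK_ABORT;	/* __fsck_err() sets BCH_FSCK_ERRORS_NOT_FIXED and jumps to fsck_err */
}

int main(void)
{
	/* mustfix_fsck_err_on(): fixable, not ignorable */
	printf("mustfix, fix_errors=1 -> %d (expect FIX=0)\n",
	       fsck_decide(true, false, true, false));
	/* unfixable_fsck_err_on(): not fixable, ignorable in continue mode */
	printf("unfixable, errors=continue -> %d (expect IGNORE=1)\n",
	       fsck_decide(false, true, false, true));
	/* fsck_err_on() with neither option set -> abort */
	printf("fixable+ignorable, no options -> %d (expect ABORT=2)\n",
	       fsck_decide(true, true, false, false));
	return 0;
}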
-/*
- * Fatal errors: these don't indicate a bug, but we can't continue running in RW
- * mode - pretty much just due to metadata IO errors:
- */
-
-void bch_fatal_error(struct bch_fs *);
-
-#define bch_fs_fatal_error(c, ...) \
-do { \
- bch_err(c, __VA_ARGS__); \
- bch_fatal_error(c); \
-} while (0)
-
-#define bch_fs_fatal_err_on(cond, c, ...) \
-({ \
- int _ret = !!(cond); \
- \
- if (_ret) \
- bch_fs_fatal_error(c, __VA_ARGS__); \
- _ret; \
-})
-
-#define bch_dev_fatal_error(ca, ...) \
-do { \
- bch_err(ca, __VA_ARGS__); \
- bch_fatal_error(c); \
-} while (0)
-
-#define bch_dev_fatal_io_error(ca, fmt, ...) \
-do { \
- printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \
- "fatal IO error on %s for " fmt), \
- (ca)->name, ##__VA_ARGS__); \
- bch_fatal_error((ca)->fs); \
-} while (0)
-
-#define bch_dev_fatal_io_err_on(cond, ca, ...) \
-({ \
- int _ret = !!(cond); \
- \
- if (_ret) \
- bch_dev_fatal_io_error(ca, __VA_ARGS__); \
- _ret; \
-})
-
-/*
- * Nonfatal IO errors: either recoverable metadata IO (because we have
- * replicas), or data IO - we need to log it and print out a message, but we
- * don't (necessarily) want to shut down the fs:
- */
-
-void bch_account_io_completion(struct bch_dev *);
-void bch_account_io_completion_time(struct bch_dev *, unsigned, int);
-
-void bch_nonfatal_io_error_work(struct work_struct *);
-
-/* Does the error handling without logging a message */
-void bch_nonfatal_io_error(struct bch_dev *);
-
-#if 0
-#define bch_fs_nonfatal_io_error(c, ...) \
-do { \
- bch_err(c, __VA_ARGS__); \
- bch_nonfatal_io_error(c); \
-} while (0)
-#endif
-
-/* Logs message and handles the error: */
-#define bch_dev_nonfatal_io_error(ca, fmt, ...) \
-do { \
- printk_ratelimited(KERN_ERR bch_fmt((ca)->fs, \
- "IO error on %s for " fmt), \
- (ca)->name, ##__VA_ARGS__); \
- bch_nonfatal_io_error(ca); \
-} while (0)
-
-#define bch_dev_nonfatal_io_err_on(cond, ca, ...) \
-({ \
- bool _ret = (cond); \
- \
- if (_ret) \
- bch_dev_nonfatal_io_error(ca, __VA_ARGS__); \
- _ret; \
-})
-
-/* kill? */
-
-#define __bcache_io_error(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch_fmt(c, \
- "IO error: " fmt), ##__VA_ARGS__)
-
-#define bcache_io_error(c, bio, fmt, ...) \
-do { \
- __bcache_io_error(c, fmt, ##__VA_ARGS__); \
- (bio)->bi_error = -EIO; \
-} while (0)
-
-#endif /* _BCACHE_ERROR_H */
diff --git a/libbcache/extents.c b/libbcache/extents.c
deleted file mode 100644
index 4b422fb1..00000000
--- a/libbcache/extents.c
+++ /dev/null
@@ -1,2498 +0,0 @@
-/*
- * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
- *
- * Code for managing the extent btree and dynamically updating the writeback
- * dirty sector count.
- */
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "checksum.h"
-#include "debug.h"
-#include "dirent.h"
-#include "error.h"
-#include "extents.h"
-#include "inode.h"
-#include "journal.h"
-#include "super-io.h"
-#include "writeback.h"
-#include "xattr.h"
-
-#include <trace/events/bcache.h>
-
-static enum merge_result bch_extent_merge(struct bch_fs *, struct btree *,
- struct bkey_i *, struct bkey_i *);
-
-static void sort_key_next(struct btree_node_iter *iter,
- struct btree *b,
- struct btree_node_iter_set *i)
-{
- i->k += __btree_node_offset_to_key(b, i->k)->u64s;
-
- if (i->k == i->end)
- *i = iter->data[--iter->used];
-}
-
-/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for btree_sort_fixup() - if there are multiple keys that compare
- * equal in different sets, we have to process them newest to oldest.
- */
-#define key_sort_cmp(l, r) \
-({ \
- int _c = bkey_cmp_packed(b, \
- __btree_node_offset_to_key(b, (l).k), \
- __btree_node_offset_to_key(b, (r).k)); \
- \
- _c ? _c > 0 : (l).k > (r).k; \
-})
-
-static inline bool should_drop_next_key(struct btree_node_iter *iter,
- struct btree *b)
-{
- struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
- struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
-
- if (bkey_whiteout(k))
- return true;
-
- if (iter->used < 2)
- return false;
-
- if (iter->used > 2 &&
- key_sort_cmp(r[0], r[1]))
- r++;
-
- /*
- * key_sort_cmp() ensures that when keys compare equal the older key
- * comes first; so if l->k compares equal to r->k then l->k is older and
- * should be dropped.
- */
- return !bkey_cmp_packed(b,
- __btree_node_offset_to_key(b, l->k),
- __btree_node_offset_to_key(b, r->k));
-}
-
-struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *dst,
- struct btree *b,
- struct btree_node_iter *iter)
-{
- struct bkey_packed *out = dst->start;
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- heap_resort(iter, key_sort_cmp);
-
- while (!bch_btree_node_iter_end(iter)) {
- if (!should_drop_next_key(iter, b)) {
- struct bkey_packed *k =
- __btree_node_offset_to_key(b, iter->data->k);
-
- bkey_copy(out, k);
- btree_keys_account_key_add(&nr, 0, out);
- out = bkey_next(out);
- }
-
- sort_key_next(iter, b, iter->data);
- heap_sift(iter, 0, key_sort_cmp);
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
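For duplicate keys that compare equal, should_drop_next_key() drops the copy at the top of the heap so that only the newest version survives in the output bset. The standalone sketch below shows the same keep-newest rule using qsort() over a flat array instead of a heap over per-bset iterators; struct demo_key and the other names are illustrative, not bcache types.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* A key stands in for a bkey: 'pos' is its position, 'seq' says which bset it
 * came from (higher = newer). Names are illustrative only. */
struct demo_key {
	uint64_t pos;
	unsigned seq;
};

/* Sort by position; among equal positions, newest (highest seq) first. */
static int demo_key_cmp(const void *_l, const void *_r)
{
	const struct demo_key *l = _l, *r = _r;

	if (l->pos != r->pos)
		return l->pos < r->pos ? -1 : 1;
	return l->seq > r->seq ? -1 : (l->seq < r->seq);
}

/* Keep only the newest key for each position, as the fix_overlapping sort
 * does when it drops duplicates that compare equal. Returns the new count. */
static size_t dedup_keep_newest(struct demo_key *keys, size_t nr)
{
	size_t out = 0, i;

	qsort(keys, nr, sizeof(keys[0]), demo_key_cmp);

	for (i = 0; i < nr; i++)
		if (!out || keys[out - 1].pos != keys[i].pos)
			keys[out++] = keys[i];
	return out;
}

int main(void)
{
	struct demo_key keys[] = {
		{ 10, 0 }, { 20, 0 }, { 20, 2 }, { 30, 1 }, { 20, 1 },
	};
	size_t i, nr = dedup_keep_newest(keys, 5);

	for (i = 0; i < nr; i++)
		printf("pos %llu seq %u\n",
		       (unsigned long long) keys[i].pos, keys[i].seq);
	return 0;
}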
-
-/* Common among btree and extent ptrs */
-
-const struct bch_extent_ptr *
-bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
-{
- const struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(e, ptr)
- if (ptr->dev == dev)
- return ptr;
-
- return NULL;
-}
-
-unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
-{
- const struct bch_extent_ptr *ptr;
- unsigned nr_ptrs = 0;
-
- extent_for_each_ptr(e, ptr)
- nr_ptrs++;
-
- return nr_ptrs;
-}
-
-unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c k)
-{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- unsigned nr_ptrs = 0;
-
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr(e, ptr)
- nr_ptrs += !ptr->cached;
- break;
-
- case BCH_RESERVATION:
- nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
- break;
- }
-
- return nr_ptrs;
-}
-
-/* returns true if equal */
-static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
-{
- return extent_crc_type(l) == extent_crc_type(r) &&
- !memcmp(l, r, extent_entry_bytes(to_entry(l)));
-}
-
-/* Increment pointers after @crc by crc's offset until the next crc entry: */
-void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
-{
- union bch_extent_entry *entry;
-
- extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
- if (!extent_entry_is_ptr(entry))
- return;
-
- entry->ptr.offset += crc_offset(crc);
- }
-}
-
-/*
- * We're writing another replica for this extent, so while we've got the data in
- * memory we'll be computing a new checksum for the currently live data.
- *
- * If there are other replicas we aren't moving, and they are checksummed but
- * not compressed, we can modify them to point to only the data that is
- * currently live (so that readers won't have to bounce) while we've got the
- * checksum we need:
- *
- * XXX: to guard against data being corrupted while in memory, instead of
- * recomputing the checksum here, it would be better in the read path to,
- * instead of computing the checksum of the entire extent:
- *
- * | extent |
- *
- * compute the checksums of the live and dead data separately
- * | dead data || live data || dead data |
- *
- * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
- * use crc_live here (that we verified was correct earlier)
- *
- * note: doesn't work with encryption
- */
-void bch_extent_narrow_crcs(struct bkey_s_extent e)
-{
- union bch_extent_crc *crc;
- bool have_wide = false, have_narrow = false;
- struct bch_csum csum = { 0 };
- unsigned csum_type = 0;
-
- extent_for_each_crc(e, crc) {
- if (crc_compression_type(crc) ||
- bch_csum_type_is_encryption(crc_csum_type(crc)))
- continue;
-
- if (crc_uncompressed_size(e.k, crc) != e.k->size) {
- have_wide = true;
- } else {
- have_narrow = true;
- csum = crc_csum(crc);
- csum_type = crc_csum_type(crc);
- }
- }
-
- if (!have_wide || !have_narrow)
- return;
-
- extent_for_each_crc(e, crc) {
- if (crc_compression_type(crc))
- continue;
-
- if (crc_uncompressed_size(e.k, crc) != e.k->size) {
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- BUG();
- case BCH_EXTENT_CRC32:
- if (bch_crc_bytes[csum_type] > 4)
- continue;
-
- bch_extent_crc_narrow_pointers(e, crc);
- crc->crc32._compressed_size = e.k->size - 1;
- crc->crc32._uncompressed_size = e.k->size - 1;
- crc->crc32.offset = 0;
- crc->crc32.csum_type = csum_type;
- crc->crc32.csum = csum.lo;
- break;
- case BCH_EXTENT_CRC64:
- if (bch_crc_bytes[csum_type] > 10)
- continue;
-
- bch_extent_crc_narrow_pointers(e, crc);
- crc->crc64._compressed_size = e.k->size - 1;
- crc->crc64._uncompressed_size = e.k->size - 1;
- crc->crc64.offset = 0;
- crc->crc64.csum_type = csum_type;
- crc->crc64.csum_lo = csum.lo;
- crc->crc64.csum_hi = csum.hi;
- break;
- case BCH_EXTENT_CRC128:
- if (bch_crc_bytes[csum_type] > 16)
- continue;
-
- bch_extent_crc_narrow_pointers(e, crc);
- crc->crc128._compressed_size = e.k->size - 1;
- crc->crc128._uncompressed_size = e.k->size - 1;
- crc->crc128.offset = 0;
- crc->crc128.csum_type = csum_type;
- crc->crc128.csum = csum;
- break;
- }
- }
- }
-}
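The dead/live/dead verification described in the comment above needs a checksum that can be combined across concatenated regions. Real extents use crc32c/crc64, which require a proper combine step, so the sketch below substitutes a toy additive byte sum purely to illustrate the check crc_dead1 + crc_live + crc_dead2 == orig_crc; none of these names are bcache APIs.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy checksum standing in for the real crc: a plain byte sum composes over
 * concatenation by addition, which keeps the example short. */
static uint64_t toy_csum(const uint8_t *p, size_t len)
{
	uint64_t sum = 0;

	while (len--)
		sum += *p++;
	return sum;
}

int main(void)
{
	uint8_t extent[64];
	size_t live_off = 16, live_len = 24;	/* the part still referenced */
	size_t i;

	for (i = 0; i < sizeof(extent); i++)
		extent[i] = (uint8_t) (i * 7 + 3);

	uint64_t orig      = toy_csum(extent, sizeof(extent));
	uint64_t crc_dead1 = toy_csum(extent, live_off);
	uint64_t crc_live  = toy_csum(extent + live_off, live_len);
	uint64_t crc_dead2 = toy_csum(extent + live_off + live_len,
				      sizeof(extent) - live_off - live_len);

	/* The check the comment describes: dead1 + live + dead2 == orig,
	 * after which crc_live alone can be trusted for the narrowed extent. */
	assert(crc_dead1 + crc_live + crc_dead2 == orig);
	printf("narrowed checksum verified: %llu\n",
	       (unsigned long long) crc_live);
	return 0;
}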
-
-void bch_extent_drop_redundant_crcs(struct bkey_s_extent e)
-{
- union bch_extent_entry *entry = e.v->start;
- union bch_extent_crc *crc, *prev = NULL;
-
- while (entry != extent_entry_last(e)) {
- union bch_extent_entry *next = extent_entry_next(entry);
- size_t crc_u64s = extent_entry_u64s(entry);
-
- if (!extent_entry_is_crc(entry))
- goto next;
-
- crc = entry_to_crc(entry);
-
- if (next == extent_entry_last(e)) {
- /* crc entry with no pointers after it: */
- goto drop;
- }
-
- if (extent_entry_is_crc(next)) {
- /* no pointers before next crc entry: */
- goto drop;
- }
-
- if (prev && crc_cmp(crc, prev)) {
- /* identical to previous crc entry: */
- goto drop;
- }
-
- if (!prev &&
- !crc_csum_type(crc) &&
- !crc_compression_type(crc)) {
- /* null crc entry: */
- bch_extent_crc_narrow_pointers(e, crc);
- goto drop;
- }
-
- prev = crc;
-next:
- entry = next;
- continue;
-drop:
- memmove_u64s_down(crc, next,
- (u64 *) extent_entry_last(e) - (u64 *) next);
- e.k->u64s -= crc_u64s;
- }
-
- EBUG_ON(bkey_val_u64s(e.k) && !bch_extent_nr_ptrs(e.c));
-}
-
-static bool should_drop_ptr(const struct bch_fs *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr)
-{
- return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
-}
-
-static void bch_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
-{
- struct bch_extent_ptr *ptr = &e.v->start->ptr;
- bool dropped = false;
-
- while ((ptr = extent_ptr_next(e, ptr)))
- if (should_drop_ptr(c, e.c, ptr)) {
- __bch_extent_drop_ptr(e, ptr);
- dropped = true;
- } else
- ptr++;
-
- if (dropped)
- bch_extent_drop_redundant_crcs(e);
-}
-
-static bool bch_ptr_normalize(struct bch_fs *c, struct btree *bk,
- struct bkey_s k)
-{
- return bch_extent_normalize(c, k);
-}
-
-static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
-{
- switch (k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED: {
- union bch_extent_entry *entry;
- u64 *d = (u64 *) bkeyp_val(f, k);
- unsigned i;
-
- for (i = 0; i < bkeyp_val_u64s(f, k); i++)
- d[i] = swab64(d[i]);
-
- for (entry = (union bch_extent_entry *) d;
- entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
- entry = extent_entry_next(entry)) {
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.csum = swab32(entry->crc32.csum);
- break;
- case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
- entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
- break;
- case BCH_EXTENT_ENTRY_crc128:
-				entry->crc128.csum.hi = swab64(entry->crc128.csum.hi);
-				entry->crc128.csum.lo = swab64(entry->crc128.csum.lo);
- break;
- case BCH_EXTENT_ENTRY_ptr:
- break;
- }
- }
- break;
- }
- }
-}
-
-static const char *extent_ptr_invalid(const struct bch_fs *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
-{
- const struct bch_extent_ptr *ptr2;
- struct bch_dev *ca;
-
- if (ptr->dev >= c->sb.nr_devices)
- return "pointer to invalid device";
-
- ca = c->devs[ptr->dev];
- if (!ca)
- return "pointer to invalid device";
-
- extent_for_each_ptr(e, ptr2)
- if (ptr != ptr2 && ptr->dev == ptr2->dev)
- return "multiple pointers to same device";
-
- if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
- return "offset past end of device";
-
- if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
- return "offset before first bucket";
-
- if ((ptr->offset & (ca->mi.bucket_size - 1)) +
- size_ondisk > ca->mi.bucket_size)
- return "spans multiple buckets";
-
- if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data))
- return "device not marked as containing data";
-
- return NULL;
-}
-
-static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c_extent e)
-{
- char *out = buf, *end = buf + size;
- const union bch_extent_entry *entry;
- const union bch_extent_crc *crc;
- const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
- bool first = true;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- extent_for_each_entry(e, entry) {
- if (!first)
- p(" ");
-
- switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = entry_to_crc(entry);
-
- p("crc: c_size %u size %u offset %u csum %u compress %u",
- crc_compressed_size(e.k, crc),
- crc_uncompressed_size(e.k, crc),
- crc_offset(crc), crc_csum_type(crc),
- crc_compression_type(crc));
- break;
- case BCH_EXTENT_ENTRY_ptr:
- ptr = entry_to_ptr(entry);
- ca = c->devs[ptr->dev];
-
- p("ptr: %u:%llu gen %u%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ca && ptr_stale(ca, ptr)
- ? " stale" : "");
- break;
- default:
- p("(invalid extent entry %.16llx)", *((u64 *) entry));
- goto out;
- }
-
- first = false;
- }
-out:
- if (bkey_extent_is_cached(e.k))
- p(" cached");
-#undef p
- return out - buf;
-}
-
-/* Btree ptrs */
-
-static const char *bch_btree_ptr_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
-{
- if (bkey_extent_is_cached(k.k))
- return "cached";
-
- if (k.k->size)
- return "nonzero key size";
-
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
-
- switch (k.k->type) {
- case BCH_EXTENT: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
- const char *reason;
-
- extent_for_each_entry(e, entry)
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- return "invalid extent entry type";
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- reason = extent_ptr_invalid(c, e, ptr,
- c->sb.btree_node_size,
- true);
- if (reason)
- return reason;
- }
-
- if (crc)
- return "has crc field";
-
- return NULL;
- }
-
- default:
- return "invalid value type";
- }
-}
-
-static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
-{
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- unsigned seq;
- const char *err;
- char buf[160];
- struct bucket *g;
- struct bch_dev *ca;
- unsigned replicas = 0;
- bool bad;
-
- extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
- g = PTR_BUCKET(ca, ptr);
- replicas++;
-
- err = "stale";
- if (ptr_stale(ca, ptr))
- goto err;
-
- do {
- seq = read_seqcount_begin(&c->gc_pos_lock);
- bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- g->mark.data_type != BUCKET_BTREE;
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
- err = "inconsistent";
- if (bad)
- goto err;
- }
-
- if (replicas < c->sb.meta_replicas_have) {
- bch_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), k);
- bch_fs_bug(c,
- "btree key bad (too few replicas, %u < %u): %s",
- replicas, c->sb.meta_replicas_have, buf);
- return;
- }
-
- return;
-err:
- bch_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
- bch_fs_bug(c, "%s btree pointer %s: bucket %zi prio %i "
- "gen %i last_gc %i mark %08x",
- err, buf, PTR_BUCKET_NR(ca, ptr),
- g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
- ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
- (unsigned) g->mark.counter);
-}
-
-static void bch_btree_ptr_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
-{
- char *out = buf, *end = buf + size;
- const char *invalid;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- if (bkey_extent_is_data(k.k))
- out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
-
- invalid = bch_btree_ptr_invalid(c, k);
- if (invalid)
- p(" invalid: %s", invalid);
-#undef p
-}
-
-struct extent_pick_ptr
-bch_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
-{
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
- const union bch_extent_crc *crc;
- const struct bch_extent_ptr *ptr;
- struct extent_pick_ptr pick = { .ca = NULL };
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = c->devs[ptr->dev];
- struct btree *root = btree_node_root(c, b);
-
- if (bch_fs_inconsistent_on(crc, c,
- "btree node pointer with crc at btree %u level %u/%u bucket %zu",
- b->btree_id, b->level, root ? root->level : -1,
- PTR_BUCKET_NR(ca, ptr)))
- break;
-
- if (bch_dev_inconsistent_on(ptr_stale(ca, ptr), ca,
- "stale btree node pointer at btree %u level %u/%u bucket %zu",
- b->btree_id, b->level, root ? root->level : -1,
- PTR_BUCKET_NR(ca, ptr)))
- continue;
-
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
- continue;
-
- if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
- continue;
-
- if (!percpu_ref_tryget(&ca->io_ref))
- continue;
-
- if (pick.ca)
- percpu_ref_put(&pick.ca->io_ref);
-
- pick.ca = ca;
- pick.ptr = *ptr;
- }
-
- return pick;
-}
-
-const struct bkey_ops bch_bkey_btree_ops = {
- .key_invalid = bch_btree_ptr_invalid,
- .key_debugcheck = btree_ptr_debugcheck,
- .val_to_text = bch_btree_ptr_to_text,
- .swab = bch_ptr_swab,
-};
-
-/* Extents */
-
-static bool __bch_cut_front(struct bpos where, struct bkey_s k)
-{
- u64 len = 0;
-
- if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
- return false;
-
- EBUG_ON(bkey_cmp(where, k.k->p) > 0);
-
- len = k.k->p.offset - where.offset;
-
- BUG_ON(len > k.k->size);
-
- /*
- * Don't readjust offset if the key size is now 0, because that could
- * cause offset to point to the next bucket:
- */
- if (!len)
- __set_bkey_deleted(k.k);
- else if (bkey_extent_is_data(k.k)) {
- struct bkey_s_extent e = bkey_s_to_extent(k);
- struct bch_extent_ptr *ptr;
- union bch_extent_crc *crc, *prev_crc = NULL;
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- ptr->offset += e.k->size - len;
- break;
- case BCH_EXTENT_CRC32:
- if (prev_crc != crc)
- crc->crc32.offset += e.k->size - len;
- break;
- case BCH_EXTENT_CRC64:
- if (prev_crc != crc)
- crc->crc64.offset += e.k->size - len;
- break;
- case BCH_EXTENT_CRC128:
- if (prev_crc != crc)
- crc->crc128.offset += e.k->size - len;
- break;
- }
- prev_crc = crc;
- }
- }
-
- k.k->size = len;
-
- return true;
-}
-
-bool bch_cut_front(struct bpos where, struct bkey_i *k)
-{
- return __bch_cut_front(where, bkey_i_to_s(k));
-}
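Cutting the front off an extent has to advance the device pointer offsets by the number of sectors dropped, which is the ptr->offset += e.k->size - len bookkeeping above. A standalone sketch of that bookkeeping on a simplified extent (struct demo_extent and demo_cut_front() are illustrative names, not the real bkey layout):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* A stripped-down extent: logical range [start, start + size) backed by data
 * at dev_offset on some device. Illustrative only. */
struct demo_extent {
	uint64_t start;
	uint64_t size;
	uint64_t dev_offset;
};

/*
 * Trim the front of the extent to 'where', advancing the device pointer by
 * the number of sectors dropped - the same bookkeeping __bch_cut_front()
 * does for uncompressed pointers.
 */
static void demo_cut_front(uint64_t where, struct demo_extent *e)
{
	uint64_t dropped;

	if (where <= e->start)
		return;
	assert(where <= e->start + e->size);

	dropped = where - e->start;
	e->start = where;
	e->size -= dropped;
	e->dev_offset += dropped;	/* still points at the data for 'start' */
}

int main(void)
{
	struct demo_extent e = { .start = 100, .size = 16, .dev_offset = 4096 };

	demo_cut_front(106, &e);
	printf("start %llu size %llu dev_offset %llu\n",
	       (unsigned long long) e.start,
	       (unsigned long long) e.size,
	       (unsigned long long) e.dev_offset);	/* 106 10 4102 */
	return 0;
}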
-
-bool bch_cut_back(struct bpos where, struct bkey *k)
-{
- u64 len = 0;
-
- if (bkey_cmp(where, k->p) >= 0)
- return false;
-
- EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
-
- len = where.offset - bkey_start_offset(k);
-
- BUG_ON(len > k->size);
-
- k->p = where;
- k->size = len;
-
- if (!len)
- __set_bkey_deleted(k);
-
- return true;
-}
-
-/**
- * bch_key_resize - adjust size of @k
- *
- * bkey_start_offset(k) will be preserved, modifies where the extent ends
- */
-void bch_key_resize(struct bkey *k,
- unsigned new_size)
-{
- k->p.offset -= k->size;
- k->p.offset += new_size;
- k->size = new_size;
-}
-
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
- struct bkey_packed *dst, struct bkey *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
- bool ret;
-
- if ((dst_unpacked = packed_to_bkey(dst))) {
- dst_unpacked->k = *src;
- ret = true;
- } else {
- ret = bkey_pack_key(dst, src, f);
- }
-
- if (ret && iter)
- bch_verify_key_order(b, iter, dst);
-
- return ret;
-}
-
-static void extent_save(struct btree *b, struct btree_node_iter *iter,
- struct bkey_packed *dst, struct bkey *src)
-{
- BUG_ON(!__extent_save(b, iter, dst, src));
-}
-
-/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-#define extent_sort_cmp(l, r) \
-({ \
- struct bkey _ul = bkey_unpack_key(b, \
- __btree_node_offset_to_key(b, (l).k)); \
- struct bkey _ur = bkey_unpack_key(b, \
- __btree_node_offset_to_key(b, (r).k)); \
- \
- int _c = bkey_cmp(bkey_start_pos(&_ul), bkey_start_pos(&_ur)); \
- _c ? _c > 0 : (l).k < (r).k; \
-})
-
-static inline void extent_sort_sift(struct btree_node_iter *iter,
- struct btree *b, size_t i)
-{
- heap_sift(iter, i, extent_sort_cmp);
-}
-
-static inline void extent_sort_next(struct btree_node_iter *iter,
- struct btree *b,
- struct btree_node_iter_set *i)
-{
- sort_key_next(iter, b, i);
- heap_sift(iter, i - iter->data, extent_sort_cmp);
-}
-
-static void extent_sort_append(struct bch_fs *c,
- struct btree *b,
- struct btree_nr_keys *nr,
- struct bkey_packed *start,
- struct bkey_packed **prev,
- struct bkey_packed *k)
-{
- struct bkey_format *f = &b->format;
- BKEY_PADDED(k) tmp;
-
- if (bkey_whiteout(k))
- return;
-
- bkey_unpack(b, &tmp.k, k);
-
- if (*prev &&
- bch_extent_merge(c, b, (void *) *prev, &tmp.k))
- return;
-
- if (*prev) {
- bkey_pack(*prev, (void *) *prev, f);
-
- btree_keys_account_key_add(nr, 0, *prev);
- *prev = bkey_next(*prev);
- } else {
- *prev = start;
- }
-
- bkey_copy(*prev, &tmp.k);
-}
-
-struct btree_nr_keys bch_extent_sort_fix_overlapping(struct bch_fs *c,
- struct bset *dst,
- struct btree *b,
- struct btree_node_iter *iter)
-{
- struct bkey_format *f = &b->format;
- struct btree_node_iter_set *_l = iter->data, *_r;
- struct bkey_packed *prev = NULL, *out, *lk, *rk;
- struct bkey l_unpacked, r_unpacked;
- struct bkey_s l, r;
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
-
- heap_resort(iter, extent_sort_cmp);
-
- while (!bch_btree_node_iter_end(iter)) {
- lk = __btree_node_offset_to_key(b, _l->k);
-
- if (iter->used == 1) {
- extent_sort_append(c, b, &nr, dst->start, &prev, lk);
- extent_sort_next(iter, b, _l);
- continue;
- }
-
- _r = iter->data + 1;
- if (iter->used > 2 &&
- extent_sort_cmp(_r[0], _r[1]))
- _r++;
-
- rk = __btree_node_offset_to_key(b, _r->k);
-
- l = __bkey_disassemble(b, lk, &l_unpacked);
- r = __bkey_disassemble(b, rk, &r_unpacked);
-
- /* If current key and next key don't overlap, just append */
- if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
- extent_sort_append(c, b, &nr, dst->start, &prev, lk);
- extent_sort_next(iter, b, _l);
- continue;
- }
-
- /* Skip 0 size keys */
- if (!r.k->size) {
- extent_sort_next(iter, b, _r);
- continue;
- }
-
- /*
- * overlap: keep the newer key and trim the older key so they
- * don't overlap. comparing pointers tells us which one is
- * newer, since the bsets are appended one after the other.
- */
-
- /* can't happen because of comparison func */
- BUG_ON(_l->k < _r->k &&
- !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
- if (_l->k > _r->k) {
- /* l wins, trim r */
- if (bkey_cmp(l.k->p, r.k->p) >= 0) {
- sort_key_next(iter, b, _r);
- } else {
- __bch_cut_front(l.k->p, r);
- extent_save(b, NULL, rk, r.k);
- }
-
- extent_sort_sift(iter, b, _r - iter->data);
- } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
- BKEY_PADDED(k) tmp;
-
- /*
- * r wins, but it overlaps in the middle of l - split l:
- */
- bkey_reassemble(&tmp.k, l.s_c);
- bch_cut_back(bkey_start_pos(r.k), &tmp.k.k);
-
- __bch_cut_front(r.k->p, l);
- extent_save(b, NULL, lk, l.k);
-
- extent_sort_sift(iter, b, 0);
-
- extent_sort_append(c, b, &nr, dst->start, &prev,
- bkey_to_packed(&tmp.k));
- } else {
- bch_cut_back(bkey_start_pos(r.k), l.k);
- extent_save(b, NULL, lk, l.k);
- }
- }
-
- if (prev) {
- bkey_pack(prev, (void *) prev, f);
- btree_keys_account_key_add(&nr, 0, prev);
- out = bkey_next(prev);
- } else {
- out = dst->start;
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- return nr;
-}
-
-struct extent_insert_state {
- struct btree_insert *trans;
- struct btree_insert_entry *insert;
- struct bpos committed;
- struct bch_fs_usage stats;
-
- /* for deleting: */
- struct bkey_i whiteout;
- bool do_journal;
- bool deleting;
-};
-
-static void bch_add_sectors(struct extent_insert_state *s,
- struct bkey_s_c k, u64 offset, s64 sectors)
-{
- struct bch_fs *c = s->trans->c;
- struct btree *b = s->insert->iter->nodes[0];
-
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
-
- if (!sectors)
- return;
-
- bch_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
- &s->stats, s->trans->journal_res.seq);
-
- if (bkey_extent_is_data(k.k) &&
- !bkey_extent_is_cached(k.k))
- bcache_dev_sectors_dirty_add(c, k.k->p.inode, offset, sectors);
-}
-
-static void bch_subtract_sectors(struct extent_insert_state *s,
- struct bkey_s_c k, u64 offset, s64 sectors)
-{
- bch_add_sectors(s, k, offset, -sectors);
-}
-
-/* These wrappers subtract exactly the sectors that we're removing from @k */
-static void bch_cut_subtract_back(struct extent_insert_state *s,
- struct bpos where, struct bkey_s k)
-{
- bch_subtract_sectors(s, k.s_c, where.offset,
- k.k->p.offset - where.offset);
- bch_cut_back(where, k.k);
-}
-
-static void bch_cut_subtract_front(struct extent_insert_state *s,
- struct bpos where, struct bkey_s k)
-{
- bch_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
- where.offset - bkey_start_offset(k.k));
- __bch_cut_front(where, k);
-}
-
-static void bch_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
-{
- if (k.k->size)
- bch_subtract_sectors(s, k.s_c,
- bkey_start_offset(k.k), k.k->size);
- k.k->size = 0;
- __set_bkey_deleted(k.k);
-}
-
-/*
- * Note: If this returns true because only some pointers matched,
- * we can lose some caching that had happened in the interim.
- * Because cache promotion only promotes the part of the extent
- * actually read, and not the whole extent, and due to the key
- * splitting done in bch_extent_insert_fixup, preserving such
- * caching is difficult.
- */
-static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
-{
- struct bkey_s_c_extent le, re;
- const struct bch_extent_ptr *lp, *rp;
- s64 offset;
-
- BUG_ON(!l.k->size || !r.k->size);
-
- if (l.k->type != r.k->type ||
- bversion_cmp(l.k->version, r.k->version))
- return false;
-
- switch (l.k->type) {
- case KEY_TYPE_COOKIE:
- return !memcmp(bkey_s_c_to_cookie(l).v,
- bkey_s_c_to_cookie(r).v,
- sizeof(struct bch_cookie));
-
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- le = bkey_s_c_to_extent(l);
- re = bkey_s_c_to_extent(r);
-
- /*
- * bkey_cmpxchg() handles partial matches - when either l or r
- * has been trimmed - so we need just to handle l or r not
- * starting at the same place when checking for a match here.
- *
- * If the starts of the keys are different, we just apply that
- * offset to the device pointer offsets when checking those -
- * matching how bch_cut_front() adjusts device pointer offsets
- * when adjusting the start of a key:
- */
- offset = bkey_start_offset(l.k) - bkey_start_offset(r.k);
-
- /*
- * XXX: perhaps we only raced with copygc or tiering replacing
- * one of the pointers: it should suffice to find _any_ matching
- * pointer
- */
-
- if (bkey_val_u64s(le.k) != bkey_val_u64s(re.k))
- return false;
-
- extent_for_each_ptr(le, lp) {
- const union bch_extent_entry *entry =
- vstruct_idx(re.v, (u64 *) lp - le.v->_data);
-
- if (!extent_entry_is_ptr(entry))
- return false;
-
- rp = &entry->ptr;
-
- if (lp->offset != rp->offset + offset ||
- lp->dev != rp->dev ||
- lp->gen != rp->gen)
- return false;
- }
-
- return true;
- default:
- return false;
- }
-
-}
-
-/*
- * Returns true on success, false on failure (and false means @new no longer
- * overlaps with @k)
- *
- * If returned true, we may have inserted up to one key in @b.
- * If returned false, we may have inserted up to two keys in @b.
- *
- * On return, there is room in @res for at least one more key of the same size
- * as @new.
- */
-enum extent_insert_hook_ret bch_extent_cmpxchg(struct extent_insert_hook *hook,
- struct bpos committed_pos,
- struct bpos next_pos,
- struct bkey_s_c k,
- const struct bkey_i *new)
-{
- struct bch_replace_info *replace = container_of(hook,
- struct bch_replace_info, hook);
- struct bkey_i *old = &replace->key;
-
- EBUG_ON(bkey_cmp(committed_pos, bkey_start_pos(&new->k)) < 0);
-
- /* must have something to compare against */
- EBUG_ON(!bkey_val_u64s(&old->k));
-
- /* new must be a subset of old */
- EBUG_ON(bkey_cmp(new->k.p, old->k.p) > 0 ||
- bkey_cmp(bkey_start_pos(&new->k), bkey_start_pos(&old->k)) < 0);
-
- if (k.k && bch_extent_cmpxchg_cmp(k, bkey_i_to_s_c(old))) {
- replace->successes++;
- return BTREE_HOOK_DO_INSERT;
- } else {
- replace->failures++;
- return BTREE_HOOK_NO_INSERT;
- }
-}
-
-static bool bch_extent_merge_inline(struct bch_fs *,
- struct btree_iter *,
- struct bkey_packed *,
- struct bkey_packed *,
- bool);
-
-#define MAX_LOCK_HOLD_TIME (5 * NSEC_PER_MSEC)
-
-static enum btree_insert_ret
-extent_insert_should_stop(struct extent_insert_state *s)
-{
- struct btree *b = s->insert->iter->nodes[0];
-
- /*
- * Check if we have sufficient space in both the btree node and the
- * journal reservation:
- *
- * Each insert checks for room in the journal entry, but we check for
- * room in the btree node up-front. In the worst case, bkey_cmpxchg()
-	 * will insert two keys, and one iteration of this loop will insert one
- * key, so we need room for three keys.
- */
- if (!bch_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
- return BTREE_INSERT_BTREE_NODE_FULL;
- else if (!journal_res_insert_fits(s->trans, s->insert))
- return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
- else
- return BTREE_INSERT_OK;
-}
-
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- struct bset_tree *t = bset_tree_last(b);
- struct bkey_packed *where =
- bch_btree_node_iter_bset_pos(node_iter, b, t);
- struct bkey_packed *prev = bkey_prev(b, t, where);
- struct bkey_packed *next_live_key = where;
- unsigned clobber_u64s;
-
- if (prev)
- where = bkey_next(prev);
-
- while (next_live_key != btree_bkey_last(b, t) &&
- bkey_deleted(next_live_key))
- next_live_key = bkey_next(next_live_key);
-
- /*
- * Everything between where and next_live_key is now deleted keys, and
- * is overwritten:
- */
- clobber_u64s = (u64 *) next_live_key - (u64 *) where;
-
- if (prev &&
- bch_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
- goto drop_deleted_keys;
-
- if (next_live_key != btree_bkey_last(b, t) &&
- bch_extent_merge_inline(c, iter, bkey_to_packed(insert),
- next_live_key, false))
- goto drop_deleted_keys;
-
- bch_bset_insert(b, node_iter, where, insert, clobber_u64s);
- bch_btree_node_iter_fix(iter, b, node_iter, t, where,
- clobber_u64s, where->u64s);
- return;
-drop_deleted_keys:
- bch_bset_delete(b, where, clobber_u64s);
- bch_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
-}
-
-static void extent_insert_committed(struct extent_insert_state *s)
-{
- struct bch_fs *c = s->trans->c;
- struct btree_iter *iter = s->insert->iter;
- struct bkey_i *insert = !s->deleting
- ? s->insert->k
- : &s->whiteout;
- BKEY_PADDED(k) split;
-
- EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
- EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
-
- if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
- return;
-
- if (s->deleting && !s->do_journal) {
- bch_cut_front(s->committed, insert);
- goto done;
- }
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
-
- bkey_copy(&split.k, insert);
-
- if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
- bkey_cmp(s->committed, insert->k.p) &&
- bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
- /* XXX: possibly need to increase our reservation? */
- bch_cut_subtract_back(s, s->committed,
- bkey_i_to_s(&split.k));
- bch_cut_front(s->committed, insert);
- bch_add_sectors(s, bkey_i_to_s_c(insert),
- bkey_start_offset(&insert->k),
- insert->k.size);
- } else {
- bch_cut_back(s->committed, &split.k.k);
- bch_cut_front(s->committed, insert);
- }
-
- if (debug_check_bkeys(c))
- bkey_debugcheck(c, iter->nodes[iter->level],
- bkey_i_to_s_c(&split.k));
-
- bch_btree_journal_key(s->trans, iter, &split.k);
-
- if (!s->deleting)
- extent_bset_insert(c, iter, &split.k);
-done:
- bch_btree_iter_set_pos_same_leaf(iter, s->committed);
-
- insert->k.needs_whiteout = false;
- s->do_journal = false;
- s->trans->did_work = true;
-}
-
-static enum extent_insert_hook_ret
-__extent_insert_advance_pos(struct extent_insert_state *s,
- struct bpos next_pos,
- struct bkey_s_c k)
-{
- struct extent_insert_hook *hook = s->trans->hook;
- enum extent_insert_hook_ret ret;
-#if 0
- /*
- * Currently disabled for encryption - broken with fcollapse. Will have
- * to reenable when versions are exposed for send/receive - versions
- * will have to be monotonic then:
- */
- if (k.k && k.k->size &&
- !bversion_zero(s->insert->k->k.version) &&
- bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
- ret = BTREE_HOOK_NO_INSERT;
- } else
-#endif
- if (hook)
- ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
- else
- ret = BTREE_HOOK_DO_INSERT;
-
- EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
-
- switch (ret) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- extent_insert_committed(s);
- bch_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
-
- bch_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
- break;
- case BTREE_HOOK_RESTART_TRANS:
- return ret;
- }
-
- s->committed = next_pos;
- return ret;
-}
-
-/*
- * Update iter->pos, marking how much of @insert we've processed, and call hook
- * fn:
- */
-static enum extent_insert_hook_ret
-extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
-{
- struct btree *b = s->insert->iter->nodes[0];
- struct bpos next_pos = bpos_min(s->insert->k->k.p,
- k.k ? k.k->p : b->key.k.p);
-
- /* hole? */
- if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
- bool have_uncommitted = bkey_cmp(s->committed,
- bkey_start_pos(&s->insert->k->k)) > 0;
-
- switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
- bkey_s_c_null)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- /*
- * we had to split @insert and insert the committed
- * part - need to bail out and recheck journal
- * reservation/btree node before we advance pos past @k:
- */
- if (have_uncommitted)
- return BTREE_HOOK_NO_INSERT;
- break;
- case BTREE_HOOK_RESTART_TRANS:
- return BTREE_HOOK_RESTART_TRANS;
- }
- }
-
- /* avoid redundant calls to hook fn: */
- if (!bkey_cmp(s->committed, next_pos))
- return BTREE_HOOK_DO_INSERT;
-
- return __extent_insert_advance_pos(s, next_pos, k);
-}
-
-static enum btree_insert_ret
-extent_insert_check_split_compressed(struct extent_insert_state *s,
- struct bkey_s_c k,
- enum bch_extent_overlap overlap)
-{
- struct bch_fs *c = s->trans->c;
- unsigned sectors;
-
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bkey_extent_is_compressed(k))) {
- int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
-
- if (s->trans->flags & BTREE_INSERT_NOFAIL)
- flags |= BCH_DISK_RESERVATION_NOFAIL;
-
- switch (bch_disk_reservation_add(c,
- s->trans->disk_res,
- sectors, flags)) {
- case 0:
- break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- case -EINTR:
- return BTREE_INSERT_NEED_GC_LOCK;
- default:
- BUG();
- }
- }
-
- return BTREE_INSERT_OK;
-}
-
-static enum btree_insert_ret
-extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
- struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
- enum bch_extent_overlap overlap)
-{
- struct bch_fs *c = s->trans->c;
- struct btree_iter *iter = s->insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
-
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- /* insert overlaps with start of k: */
- bch_cut_subtract_front(s, insert->k.p, k);
- BUG_ON(bkey_deleted(k.k));
- extent_save(b, node_iter, _k, k.k);
- break;
-
- case BCH_EXTENT_OVERLAP_BACK:
- /* insert overlaps with end of k: */
- bch_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
- BUG_ON(bkey_deleted(k.k));
- extent_save(b, node_iter, _k, k.k);
-
- /*
- * As the auxiliary tree is indexed by the end of the
- * key and we've just changed the end, update the
- * auxiliary tree.
- */
- bch_bset_fix_invalidated_key(b, t, _k);
- bch_btree_node_iter_fix(iter, b, node_iter, t,
- _k, _k->u64s, _k->u64s);
- break;
-
- case BCH_EXTENT_OVERLAP_ALL: {
- struct bpos orig_pos = k.k->p;
-
- /* The insert key completely covers k, invalidate k */
- if (!bkey_whiteout(k.k))
- btree_keys_account_key_drop(&b->nr,
- t - b->set, _k);
-
- bch_drop_subtract(s, k);
- k.k->p = bkey_start_pos(&insert->k);
- if (!__extent_save(b, node_iter, _k, k.k)) {
- /*
- * Couldn't repack: we aren't necessarily able
- * to repack if the new key is outside the range
- * of the old extent, so we have to split
- * @insert:
- */
- k.k->p = orig_pos;
- extent_save(b, node_iter, _k, k.k);
-
- if (extent_insert_advance_pos(s, k.s_c) ==
- BTREE_HOOK_RESTART_TRANS)
- return BTREE_INSERT_NEED_TRAVERSE;
-
- extent_insert_committed(s);
- /*
-			 * We split and inserted up to k.k->p - that
-			 * has to coincide with iter->pos, so that we
-			 * don't have anything more to insert
- * until we recheck our journal reservation:
- */
- EBUG_ON(bkey_cmp(s->committed, k.k->p));
- } else {
- bch_bset_fix_invalidated_key(b, t, _k);
- bch_btree_node_iter_fix(iter, b, node_iter, t,
- _k, _k->u64s, _k->u64s);
- }
-
- break;
- }
- case BCH_EXTENT_OVERLAP_MIDDLE: {
- BKEY_PADDED(k) split;
- /*
- * The insert key falls 'in the middle' of k
- * The insert key splits k in 3:
- * - start only in k, preserve
- * - middle common section, invalidate in k
- * - end only in k, preserve
- *
- * We update the old key to preserve the start,
- * insert will be the new common section,
- * we manually insert the end that we are preserving.
- *
- * modify k _before_ doing the insert (which will move
- * what k points to)
- */
- bkey_reassemble(&split.k, k.s_c);
- split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
-
- bch_cut_back(bkey_start_pos(&insert->k), &split.k.k);
- BUG_ON(bkey_deleted(&split.k.k));
-
- bch_cut_subtract_front(s, insert->k.p, k);
- BUG_ON(bkey_deleted(k.k));
- extent_save(b, node_iter, _k, k.k);
-
- bch_add_sectors(s, bkey_i_to_s_c(&split.k),
- bkey_start_offset(&split.k.k),
- split.k.k.size);
- extent_bset_insert(c, iter, &split.k);
- break;
- }
- }
-
- return BTREE_INSERT_OK;
-}
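The switch above handles four ways the inserted range can sit relative to the existing key k. As a standalone illustration, the classifier below reproduces that FRONT/BACK/ALL/MIDDLE distinction over plain half-open ranges; the enum and function names are made up for the example, and it assumes the two ranges already overlap.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Half-open ranges [start, end), standing in for bkey_start_pos(k)..k->p. */
enum overlap { OVERLAP_FRONT, OVERLAP_BACK, OVERLAP_ALL, OVERLAP_MIDDLE };

/* Classify how an inserted range sits relative to an existing key's range,
 * mirroring the FRONT/BACK/ALL/MIDDLE cases handled in extent_squash().
 * Caller must ensure the two ranges actually overlap. */
static enum overlap classify(uint64_t i_start, uint64_t i_end,
			     uint64_t k_start, uint64_t k_end)
{
	assert(i_start < k_end && k_start < i_end);	/* they overlap */

	if (i_start <= k_start && i_end >= k_end)
		return OVERLAP_ALL;	/* insert completely covers k */
	if (i_start <= k_start)
		return OVERLAP_FRONT;	/* insert overlaps start of k */
	if (i_end >= k_end)
		return OVERLAP_BACK;	/* insert overlaps end of k */
	return OVERLAP_MIDDLE;		/* insert splits k in three */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       classify(0, 10, 5, 20),		/* FRONT */
	       classify(15, 30, 5, 20),		/* BACK */
	       classify(0, 30, 5, 20),		/* ALL */
	       classify(8, 12, 5, 20));		/* MIDDLE */
	return 0;
}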
-
-static enum btree_insert_ret
-bch_delete_fixup_extent(struct extent_insert_state *s)
-{
- struct bch_fs *c = s->trans->c;
- struct btree_iter *iter = s->insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- struct bkey_packed *_k;
- struct bkey unpacked;
- struct bkey_i *insert = s->insert->k;
- enum btree_insert_ret ret = BTREE_INSERT_OK;
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
- s->whiteout = *insert;
- s->do_journal = false;
-
- while (bkey_cmp(s->committed, insert->k.p) < 0 &&
- (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
- (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
- struct bset_tree *t = bch_bkey_to_bset(b, _k);
- struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
- enum bch_extent_overlap overlap;
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
- EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
- break;
-
- if (bkey_whiteout(k.k)) {
- s->committed = bpos_min(insert->k.p, k.k->p);
- goto next;
- }
-
- overlap = bch_extent_overlap(&insert->k, k.k);
-
- ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
- if (ret != BTREE_INSERT_OK)
- goto stop;
-
- switch (extent_insert_advance_pos(s, k.s_c)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- continue;
- case BTREE_HOOK_RESTART_TRANS:
- ret = BTREE_INSERT_NEED_TRAVERSE;
- goto stop;
- }
-
- s->do_journal = true;
-
- if (overlap == BCH_EXTENT_OVERLAP_ALL) {
- btree_keys_account_key_drop(&b->nr,
- t - b->set, _k);
- bch_subtract_sectors(s, k.s_c,
- bkey_start_offset(k.k), k.k->size);
- _k->type = KEY_TYPE_DISCARD;
- reserve_whiteout(b, t, _k);
- } else if (k.k->needs_whiteout ||
- bset_written(b, bset(b, t))) {
- struct bkey_i discard = *insert;
-
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- bch_cut_front(bkey_start_pos(k.k), &discard);
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- bch_cut_back(k.k->p, &discard.k);
- break;
- default:
- break;
- }
-
- discard.k.needs_whiteout = true;
-
- ret = extent_squash(s, insert, t, _k, k, overlap);
- BUG_ON(ret != BTREE_INSERT_OK);
-
- extent_bset_insert(c, iter, &discard);
- } else {
- ret = extent_squash(s, insert, t, _k, k, overlap);
- BUG_ON(ret != BTREE_INSERT_OK);
- }
-next:
- bch_cut_front(s->committed, insert);
- bch_btree_iter_set_pos_same_leaf(iter, s->committed);
- }
-
- if (bkey_cmp(s->committed, insert->k.p) < 0 &&
- ret == BTREE_INSERT_OK &&
- extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
- ret = BTREE_INSERT_NEED_TRAVERSE;
-stop:
- extent_insert_committed(s);
-
- bch_fs_usage_apply(c, &s->stats, s->trans->disk_res,
- gc_pos_btree_node(b));
-
- EBUG_ON(bkey_cmp(iter->pos, s->committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
-
- bch_cut_front(iter->pos, insert);
-
- if (insert->k.size && iter->at_end_of_leaf)
- ret = BTREE_INSERT_NEED_TRAVERSE;
-
- EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
-
- return ret;
-}
-
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- * k.size != 0 ∧ j.size != 0 →
- * ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-enum btree_insert_ret
-bch_insert_fixup_extent(struct btree_insert *trans,
- struct btree_insert_entry *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- struct bkey_packed *_k;
- struct bkey unpacked;
- enum btree_insert_ret ret = BTREE_INSERT_OK;
-
- struct extent_insert_state s = {
- .trans = trans,
- .insert = insert,
- .committed = insert->iter->pos,
- .deleting = bkey_whiteout(&insert->k->k),
- };
-
- EBUG_ON(iter->level);
- EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
-
- if (s.deleting)
- return bch_delete_fixup_extent(&s);
-
- /*
- * As we process overlapping extents, we advance @iter->pos both to
- * signal to our caller (btree_insert_key()) how much of @insert->k has
- * been inserted, and also to keep @iter->pos consistent with
- * @insert->k and the node iterator that we're advancing:
- */
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
-
- if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
- bch_add_sectors(&s, bkey_i_to_s_c(insert->k),
- bkey_start_offset(&insert->k->k),
- insert->k->k.size);
-
- while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
- (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
- (_k = bch_btree_node_iter_peek_all(node_iter, b))) {
- struct bset_tree *t = bch_bkey_to_bset(b, _k);
- struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
- enum bch_extent_overlap overlap;
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
- EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
- break;
-
- overlap = bch_extent_overlap(&insert->k->k, k.k);
-
- ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
- if (ret != BTREE_INSERT_OK)
- goto stop;
-
- if (!k.k->size)
- goto squash;
-
- /*
- * Only call advance pos & call hook for nonzero size extents:
- * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
- * overlaps with @k:
- */
- switch (extent_insert_advance_pos(&s, k.s_c)) {
- case BTREE_HOOK_DO_INSERT:
- break;
- case BTREE_HOOK_NO_INSERT:
- continue;
- case BTREE_HOOK_RESTART_TRANS:
- ret = BTREE_INSERT_NEED_TRAVERSE;
- goto stop;
- }
-
- if (k.k->size &&
- (k.k->needs_whiteout || bset_written(b, bset(b, t))))
- insert->k->k.needs_whiteout = true;
-
- if (overlap == BCH_EXTENT_OVERLAP_ALL &&
- bkey_whiteout(k.k) &&
- k.k->needs_whiteout) {
- unreserve_whiteout(b, t, _k);
- _k->needs_whiteout = false;
- }
-squash:
- ret = extent_squash(&s, insert->k, t, _k, k, overlap);
- if (ret != BTREE_INSERT_OK)
- goto stop;
- }
-
- if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
- ret == BTREE_INSERT_OK &&
- extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
- ret = BTREE_INSERT_NEED_TRAVERSE;
-stop:
- extent_insert_committed(&s);
- /*
- * Subtract any remaining sectors from @insert, if we bailed out early
- * and didn't fully insert @insert:
- */
- if (insert->k->k.size &&
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
- bch_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
- bkey_start_offset(&insert->k->k),
- insert->k->k.size);
-
- bch_fs_usage_apply(c, &s.stats, trans->disk_res,
- gc_pos_btree_node(b));
-
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
- EBUG_ON(bkey_cmp(iter->pos, s.committed));
- EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) != iter->at_end_of_leaf);
-
- if (insert->k->k.size && iter->at_end_of_leaf)
- ret = BTREE_INSERT_NEED_TRAVERSE;
-
- EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
-
- return ret;
-}
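The bset invariant documented above bch_insert_fixup_extent() (strict ordering, no overlapping extents of nonzero size within one bset) can be checked mechanically. Below is a minimal standalone checker over simplified extents, assuming keys are given in bset order; struct demo_extent and bset_extents_ordered() are illustrative names, not bcache code.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* An extent reduced to its range: start = bkey_start_offset, end = k->offset.
 * Illustrative struct, not the real bkey layout. */
struct demo_extent {
	uint64_t start, end;
};

/*
 * Check the per-bset invariant described above: keys sorted with no overlaps,
 * i.e. for consecutive keys bkey_start_offset(next) >= prev->offset.
 * Zero size keys are skipped, since the invariant is only maintained for
 * nonzero size extents.
 */
static bool bset_extents_ordered(const struct demo_extent *keys, size_t nr)
{
	uint64_t prev_end = 0;
	size_t i;

	for (i = 0; i < nr; i++) {
		if (keys[i].start == keys[i].end)
			continue;		/* zero size: not covered */
		if (keys[i].start < prev_end)
			return false;		/* overlap with previous key */
		prev_end = keys[i].end;
	}
	return true;
}

int main(void)
{
	struct demo_extent good[] = { { 0, 8 }, { 8, 16 }, { 20, 32 } };
	struct demo_extent bad[]  = { { 0, 8 }, { 4, 16 } };

	printf("good: %d, bad: %d\n",
	       bset_extents_ordered(good, 3),
	       bset_extents_ordered(bad, 2));
	return 0;
}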
-
-static const char *bch_extent_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
-{
- if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
- return "value too big";
-
- if (!k.k->size)
- return "zero key size";
-
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- const union bch_extent_crc *crc;
- const struct bch_extent_ptr *ptr;
- unsigned size_ondisk = e.k->size;
- const char *reason;
-
- extent_for_each_entry(e, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- return "invalid extent entry type";
-
- if (extent_entry_is_crc(entry)) {
- crc = entry_to_crc(entry);
-
- if (crc_offset(crc) + e.k->size >
- crc_uncompressed_size(e.k, crc))
- return "checksum offset + key size > uncompressed size";
-
- size_ondisk = crc_compressed_size(e.k, crc);
-
- if (!bch_checksum_type_valid(c, crc_csum_type(crc)))
- return "invalid checksum type";
-
- if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
- return "invalid compression type";
- } else {
- ptr = entry_to_ptr(entry);
-
- reason = extent_ptr_invalid(c, e, &entry->ptr,
- size_ondisk, false);
- if (reason)
- return reason;
- }
- }
-
- return NULL;
- }
-
- case BCH_RESERVATION: {
- struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
- return "incorrect value size";
-
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
- return "invalid nr_replicas";
-
- return NULL;
- }
-
- default:
- return "invalid value type";
- }
-}
-
-static void bch_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
- struct bkey_s_c_extent e)
-{
- const struct bch_extent_ptr *ptr;
- struct bch_dev *ca;
- struct bucket *g;
- unsigned seq, stale;
- char buf[160];
- bool bad;
- unsigned ptrs_per_tier[BCH_TIER_MAX];
- unsigned replicas = 0;
-
- /*
- * XXX: we should be doing most/all of these checks at startup time,
- * where we check bkey_invalid() in btree_node_read_done()
- *
- * But note that we can't check for stale pointers or incorrect gc marks
- * until after journal replay is done (it might be an extent that's
- * going to get overwritten during replay)
- */
-
- memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
-
- extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
- g = PTR_BUCKET(ca, ptr);
- replicas++;
- ptrs_per_tier[ca->mi.tier]++;
-
- /*
- * If journal replay hasn't finished, we might be seeing keys
- * that will be overwritten by the time journal replay is done:
- */
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- continue;
-
- stale = 0;
-
- do {
- struct bucket_mark mark;
-
- seq = read_seqcount_begin(&c->gc_pos_lock);
- mark = READ_ONCE(g->mark);
-
- /* between mark and bucket gen */
- smp_rmb();
-
- stale = ptr_stale(ca, ptr);
-
- bch_fs_bug_on(stale && !ptr->cached, c,
- "stale dirty pointer");
-
- bch_fs_bug_on(stale > 96, c,
- "key too stale: %i",
- stale);
-
- if (stale)
- break;
-
- bad = (mark.data_type != BUCKET_DATA ||
- (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
- !mark.owned_by_allocator &&
- !(ptr->cached
- ? mark.cached_sectors
- : mark.dirty_sectors)));
- } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
- if (bad)
- goto bad_ptr;
- }
-
- if (replicas > BCH_REPLICAS_MAX) {
- bch_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
- bch_fs_bug(c,
- "extent key bad (too many replicas: %u): %s",
- replicas, buf);
- return;
- }
-
- if (!bkey_extent_is_cached(e.k) &&
- replicas < c->sb.data_replicas_have) {
- bch_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
- bch_fs_bug(c,
- "extent key bad (too few replicas, %u < %u): %s",
- replicas, c->sb.data_replicas_have, buf);
- return;
- }
-
- return;
-
-bad_ptr:
- bch_bkey_val_to_text(c, btree_node_type(b), buf,
- sizeof(buf), e.s_c);
- bch_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu prio %i "
- "gen %i last_gc %i mark 0x%08x",
- buf, PTR_BUCKET_NR(ca, ptr),
- g->read_prio, PTR_BUCKET(ca, ptr)->mark.gen,
- ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
- (unsigned) g->mark.counter);
- return;
-}
-
-static void bch_extent_debugcheck(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
-{
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
- break;
- case BCH_RESERVATION:
- break;
- default:
- BUG();
- }
-}
-
-static void bch_extent_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
-{
- char *out = buf, *end = buf + size;
- const char *invalid;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- if (bkey_extent_is_data(k.k))
- out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
-
- invalid = bch_extent_invalid(c, k);
- if (invalid)
- p(" invalid: %s", invalid);
-#undef p
-}
-
-static unsigned PTR_TIER(struct bch_fs *c,
- const struct bch_extent_ptr *ptr)
-{
- return c->devs[ptr->dev]->mi.tier;
-}
-
-static void bch_extent_crc_init(union bch_extent_crc *crc,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type)
-{
- if (bch_crc_bytes[csum_type] <= 4 &&
- uncompressed_size <= CRC32_SIZE_MAX &&
- nonce <= CRC32_NONCE_MAX) {
- crc->crc32 = (struct bch_extent_crc32) {
- .type = 1 << BCH_EXTENT_ENTRY_crc32,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum = *((__le32 *) &csum.lo),
- };
- return;
- }
-
- if (bch_crc_bytes[csum_type] <= 10 &&
- uncompressed_size <= CRC64_SIZE_MAX &&
- nonce <= CRC64_NONCE_MAX) {
- crc->crc64 = (struct bch_extent_crc64) {
- .type = 1 << BCH_EXTENT_ENTRY_crc64,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .nonce = nonce,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum_lo = csum.lo,
- .csum_hi = *((__le16 *) &csum.hi),
- };
- return;
- }
-
- if (bch_crc_bytes[csum_type] <= 16 &&
- uncompressed_size <= CRC128_SIZE_MAX &&
- nonce <= CRC128_NONCE_MAX) {
- crc->crc128 = (struct bch_extent_crc128) {
- .type = 1 << BCH_EXTENT_ENTRY_crc128,
- ._compressed_size = compressed_size - 1,
- ._uncompressed_size = uncompressed_size - 1,
- .offset = 0,
- .nonce = nonce,
- .compression_type = compression_type,
- .csum_type = csum_type,
- .csum = csum,
- };
- return;
- }
-
- BUG();
-}
-
-void bch_extent_crc_append(struct bkey_i_extent *e,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type)
-{
- union bch_extent_crc *crc;
-
- BUG_ON(compressed_size > uncompressed_size);
- BUG_ON(uncompressed_size != e->k.size);
- BUG_ON(!compressed_size || !uncompressed_size);
-
- /*
- * Look up the last crc entry, so we can check if we need to add
- * another:
- */
- extent_for_each_crc(extent_i_to_s(e), crc)
- ;
-
- if (!crc && !csum_type && !compression_type)
- return;
-
- if (crc &&
- crc_compressed_size(&e->k, crc) == compressed_size &&
- crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
- crc_offset(crc) == 0 &&
- crc_nonce(crc) == nonce &&
- crc_csum_type(crc) == csum_type &&
- crc_compression_type(crc) == compression_type &&
- crc_csum(crc).lo == csum.lo &&
- crc_csum(crc).hi == csum.hi)
- return;
-
- bch_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
- compressed_size,
- uncompressed_size,
- compression_type,
- nonce, csum, csum_type);
- __extent_entry_push(e);
-}
-
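As a sketch of how the append path above fits together (an editor's illustration, not code from the original file; the helper name and the csum_type parameter are assumptions), a caller writing uncompressed data might do:

static void example_crc_then_ptr(struct bkey_i_extent *e,
				 struct bch_extent_ptr ptr,
				 struct bch_csum csum,
				 unsigned csum_type)
{
	/*
	 * Uncompressed write: compressed and uncompressed sizes both equal
	 * e->k.size (assumed nonzero), so for small checksums this lands in
	 * the compact crc32 entry chosen by bch_extent_crc_init().
	 */
	bch_extent_crc_append(e, e->k.size, e->k.size,
			      BCH_COMPRESSION_NONE, 0, csum, csum_type);

	/* the pointer appended next is covered by the crc entry just added */
	extent_ptr_append(e, ptr);
}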
-/*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
- *
- * Returns true if @k should be dropped entirely
- *
- * For existing keys, only called when btree nodes are being rewritten, not when
- * they're merely being compacted/resorted in memory.
- */
-bool bch_extent_normalize(struct bch_fs *c, struct bkey_s k)
-{
- struct bkey_s_extent e;
-
- switch (k.k->type) {
- case KEY_TYPE_ERROR:
- return false;
-
- case KEY_TYPE_DELETED:
- case KEY_TYPE_COOKIE:
- return true;
-
- case KEY_TYPE_DISCARD:
- return bversion_zero(k.k->version);
-
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_to_extent(k);
-
- bch_extent_drop_stale(c, e);
-
- if (!bkey_val_u64s(e.k)) {
- if (bkey_extent_is_cached(e.k)) {
- k.k->type = KEY_TYPE_DISCARD;
- if (bversion_zero(k.k->version))
- return true;
- } else {
- k.k->type = KEY_TYPE_ERROR;
- }
- }
-
- return false;
- case BCH_RESERVATION:
- return false;
- default:
- BUG();
- }
-}
-
-void bch_extent_mark_replicas_cached(struct bch_fs *c,
- struct bkey_s_extent e,
- unsigned nr_cached)
-{
- struct bch_extent_ptr *ptr;
- bool have_higher_tier;
- unsigned tier = 0;
-
- if (!nr_cached)
- return;
-
- do {
- have_higher_tier = false;
-
- extent_for_each_ptr(e, ptr) {
- if (!ptr->cached &&
- PTR_TIER(c, ptr) == tier) {
- ptr->cached = true;
- nr_cached--;
- if (!nr_cached)
- return;
- }
-
- if (PTR_TIER(c, ptr) > tier)
- have_higher_tier = true;
- }
-
- tier++;
- } while (have_higher_tier);
-}
-
-/*
- * This picks a non-stale pointer, preferably from a device other than
- * avoid. Avoid can be NULL, meaning pick any. If there are no non-stale
- * pointers to other devices, it will still pick a pointer from avoid.
- * Note that it prefers lower-numbered pointers to higher-numbered pointers
- * as the pointers are sorted by tier, hence preferring pointers to tier 0
- * rather than pointers to tier 1.
- */
-void bch_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
- struct bch_dev *avoid,
- struct extent_pick_ptr *ret)
-{
- struct bkey_s_c_extent e;
- const union bch_extent_crc *crc;
- const struct bch_extent_ptr *ptr;
-
- switch (k.k->type) {
- case KEY_TYPE_DELETED:
- case KEY_TYPE_DISCARD:
- case KEY_TYPE_COOKIE:
- ret->ca = NULL;
- return;
-
- case KEY_TYPE_ERROR:
- ret->ca = ERR_PTR(-EIO);
- return;
-
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
- ret->ca = NULL;
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = c->devs[ptr->dev];
-
- if (ptr->cached && ptr_stale(ca, ptr))
- continue;
-
- if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
- continue;
-
- if (ret->ca &&
- (ca == avoid ||
- ret->ca->mi.tier < ca->mi.tier))
- continue;
-
- if (!percpu_ref_tryget(&ca->io_ref))
- continue;
-
- if (ret->ca)
- percpu_ref_put(&ret->ca->io_ref);
-
- *ret = (struct extent_pick_ptr) {
- .crc = crc_to_128(e.k, crc),
- .ptr = *ptr,
- .ca = ca,
- };
- }
-
- if (!ret->ca && !bkey_extent_is_cached(e.k))
- ret->ca = ERR_PTR(-EIO);
- return;
-
- case BCH_RESERVATION:
- ret->ca = NULL;
- return;
-
- default:
- BUG();
- }
-}
-
-static enum merge_result bch_extent_merge(struct bch_fs *c,
- struct btree *bk,
- struct bkey_i *l, struct bkey_i *r)
-{
- struct bkey_s_extent el, er;
- union bch_extent_entry *en_l, *en_r;
-
- if (key_merging_disabled(c))
- return BCH_MERGE_NOMERGE;
-
- /*
- * Generic header checks
- * Assumes left and right are in order
- * Left and right must be exactly aligned
- */
-
- if (l->k.u64s != r->k.u64s ||
- l->k.type != r->k.type ||
- bversion_cmp(l->k.version, r->k.version) ||
- bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
- return BCH_MERGE_NOMERGE;
-
- switch (l->k.type) {
- case KEY_TYPE_DELETED:
- case KEY_TYPE_DISCARD:
- case KEY_TYPE_ERROR:
- /* These types are mergeable, and no val to check */
- break;
-
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- el = bkey_i_to_s_extent(l);
- er = bkey_i_to_s_extent(r);
-
- extent_for_each_entry(el, en_l) {
- struct bch_extent_ptr *lp, *rp;
- unsigned bucket_size;
-
- en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
-
- if ((extent_entry_type(en_l) !=
- extent_entry_type(en_r)) ||
- extent_entry_is_crc(en_l))
- return BCH_MERGE_NOMERGE;
-
- lp = &en_l->ptr;
- rp = &en_r->ptr;
-
- if (lp->offset + el.k->size != rp->offset ||
- lp->dev != rp->dev ||
- lp->gen != rp->gen)
- return BCH_MERGE_NOMERGE;
-
- /* We don't allow extents to straddle buckets: */
- bucket_size = c->devs[lp->dev]->mi.bucket_size;
-
- if ((lp->offset & ~((u64) bucket_size - 1)) !=
- (rp->offset & ~((u64) bucket_size - 1)))
- return BCH_MERGE_NOMERGE;
- }
-
- break;
- case BCH_RESERVATION: {
- struct bkey_i_reservation *li = bkey_i_to_reservation(l);
- struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
-
- if (li->v.generation != ri->v.generation ||
- li->v.nr_replicas != ri->v.nr_replicas)
- return BCH_MERGE_NOMERGE;
- break;
- }
- default:
- return BCH_MERGE_NOMERGE;
- }
-
- l->k.needs_whiteout |= r->k.needs_whiteout;
-
- /* Keys with no pointers aren't restricted to one bucket and could
- * overflow KEY_SIZE
- */
- if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
- bch_key_resize(&l->k, KEY_SIZE_MAX);
- bch_cut_front(l->k.p, r);
- return BCH_MERGE_PARTIAL;
- }
-
- bch_key_resize(&l->k, l->k.size + r->k.size);
-
- return BCH_MERGE_MERGE;
-}
-
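A worked example of the merge rules above (editor's illustration, not from the original file; the device, offsets and bucket size are made up):

/*
 *   l covers [0, 8)  with ptr { dev 0, offset 64, gen 3 }
 *   r covers [8, 16) with ptr { dev 0, offset 72, gen 3 }
 *
 * These merge: r starts where l ends both in the keyspace (l.k.p == 8)
 * and on disk (64 + 8 == 72), dev and gen match, and with a 128 sector
 * bucket both offsets round down to the same bucket.  The result is one
 * extent covering [0, 16); had the combined size exceeded KEY_SIZE_MAX,
 * the result would instead be BCH_MERGE_PARTIAL.
 */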
-static void extent_i_save(struct btree *b, struct bkey_packed *dst,
- struct bkey_i *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
-
- BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
-
- /*
- * We don't want the bch_verify_key_order() call in extent_save(),
- * because we may be out of order with deleted keys that are about to be
- * removed by extent_bset_insert()
- */
-
- if ((dst_unpacked = packed_to_bkey(dst)))
- bkey_copy(dst_unpacked, src);
- else
- BUG_ON(!bkey_pack(dst, src, f));
-}
-
-static bool extent_merge_one_overlapping(struct btree_iter *iter,
- struct bpos new_pos,
- struct bset_tree *t,
- struct bkey_packed *k, struct bkey uk,
- bool check, bool could_pack)
-{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
-
- BUG_ON(!bkey_deleted(k));
-
- if (check) {
- return !bkey_packed(k) || could_pack;
- } else {
- uk.p = new_pos;
- extent_save(b, node_iter, k, &uk);
- bch_bset_fix_invalidated_key(b, t, k);
- bch_btree_node_iter_fix(iter, b, node_iter, t,
- k, k->u64s, k->u64s);
- return true;
- }
-}
-
-static bool extent_merge_do_overlapping(struct btree_iter *iter,
- struct bkey *m, bool back_merge)
-{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- struct bset_tree *t;
- struct bkey_packed *k;
- struct bkey uk;
- struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
- bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
- bool check = true;
-
- /*
- * @m is the new merged extent:
- *
- * The merge took place in the last bset; we know there can't be any 0
- * size extents overlapping with m there because if so they would have
- * been between the two extents we merged.
- *
- * But in the other bsets, we have to check for and fix such extents:
- */
-do_fixup:
- for_each_bset(b, t) {
- if (t == bset_tree_last(b))
- break;
-
- /*
-		 * If the iterator's position for this bset is already at the
-		 * end (i.e. we consumed it), start searching from the last key.
- */
- k = bch_btree_node_iter_bset_pos(node_iter, b, t);
-
- if (k == btree_bkey_last(b, t))
- k = bkey_prev_all(b, t, k);
- if (!k)
- continue;
-
- if (back_merge) {
- /*
- * Back merge: 0 size extents will be before the key
- * that was just inserted (and thus the iterator
- * position) - walk backwards to find them
- */
- for (;
- k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
- k = bkey_prev_all(b, t, k)) {
- if (bkey_cmp(uk.p, m->p) >= 0)
- continue;
-
- if (!extent_merge_one_overlapping(iter, new_pos,
- t, k, uk, check, could_pack))
- return false;
- }
- } else {
- /* Front merge - walk forwards */
- for (;
- k != btree_bkey_last(b, t) &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, m->p) < 0);
- k = bkey_next(k)) {
- if (bkey_cmp(uk.p,
- bkey_start_pos(m)) <= 0)
- continue;
-
- if (!extent_merge_one_overlapping(iter, new_pos,
- t, k, uk, check, could_pack))
- return false;
- }
- }
- }
-
- if (check) {
- check = false;
- goto do_fixup;
- }
-
- return true;
-}
-
-/*
- * When merging an extent that we're inserting into a btree node, the new merged
- * extent could overlap with an existing 0 size extent - if we don't fix that,
- * it'll break the btree node iterator so this code finds those 0 size extents
- * and shifts them out of the way.
- *
- * Also unpacks and repacks.
- */
-static bool bch_extent_merge_inline(struct bch_fs *c,
- struct btree_iter *iter,
- struct bkey_packed *l,
- struct bkey_packed *r,
- bool back_merge)
-{
- struct btree *b = iter->nodes[0];
- struct btree_node_iter *node_iter = &iter->node_iters[0];
- const struct bkey_format *f = &b->format;
- struct bset_tree *t = bset_tree_last(b);
- struct bkey_packed *m;
- BKEY_PADDED(k) li;
- BKEY_PADDED(k) ri;
- struct bkey_i *mi;
- struct bkey tmp;
-
- /*
- * We need to save copies of both l and r, because we might get a
-	 * partial merge (which modifies both) and then fail to repack
- */
- bkey_unpack(b, &li.k, l);
- bkey_unpack(b, &ri.k, r);
-
- m = back_merge ? l : r;
- mi = back_merge ? &li.k : &ri.k;
-
- /* l & r should be in last bset: */
- EBUG_ON(bch_bkey_to_bset(b, m) != t);
-
- switch (bch_extent_merge(c, b, &li.k, &ri.k)) {
- case BCH_MERGE_NOMERGE:
- return false;
- case BCH_MERGE_PARTIAL:
- if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &mi->k, f))
- return false;
-
- if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
- return false;
-
- extent_i_save(b, m, mi);
- bch_bset_fix_invalidated_key(b, t, m);
-
- /*
- * Update iterator to reflect what we just inserted - otherwise,
- * the iter_fix() call is going to put us _before_ the key we
- * just partially merged with:
- */
- if (back_merge)
- bch_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
-
- bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
- t, m, m->u64s, m->u64s);
-
- if (!back_merge)
- bkey_copy(packed_to_bkey(l), &li.k);
- else
- bkey_copy(packed_to_bkey(r), &ri.k);
- return false;
- case BCH_MERGE_MERGE:
- if (bkey_packed(m) && !bkey_pack_key((void *) &tmp, &li.k.k, f))
- return false;
-
- if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
- return false;
-
- extent_i_save(b, m, &li.k);
- bch_bset_fix_invalidated_key(b, t, m);
-
- bch_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
- t, m, m->u64s, m->u64s);
- return true;
- default:
- BUG();
- }
-}
-
-const struct bkey_ops bch_bkey_extent_ops = {
- .key_invalid = bch_extent_invalid,
- .key_debugcheck = bch_extent_debugcheck,
- .val_to_text = bch_extent_to_text,
- .swab = bch_ptr_swab,
- .key_normalize = bch_ptr_normalize,
- .key_merge = bch_extent_merge,
- .is_extents = true,
-};
diff --git a/libbcache/extents.h b/libbcache/extents.h
deleted file mode 100644
index 1d63b79d..00000000
--- a/libbcache/extents.h
+++ /dev/null
@@ -1,587 +0,0 @@
-#ifndef _BCACHE_EXTENTS_H
-#define _BCACHE_EXTENTS_H
-
-#include "bcache.h"
-#include "bkey.h"
-
-#include <linux/bcache.h>
-
-struct btree_node_iter;
-struct btree_insert;
-struct btree_insert_entry;
-struct extent_insert_hook;
-
-struct btree_nr_keys bch_key_sort_fix_overlapping(struct bset *,
- struct btree *,
- struct btree_node_iter *);
-struct btree_nr_keys bch_extent_sort_fix_overlapping(struct bch_fs *c,
- struct bset *,
- struct btree *,
- struct btree_node_iter *);
-
-extern const struct bkey_ops bch_bkey_btree_ops;
-extern const struct bkey_ops bch_bkey_extent_ops;
-
-struct bch_fs;
-struct journal_res;
-
-struct extent_pick_ptr {
- struct bch_extent_crc128 crc;
- struct bch_extent_ptr ptr;
- struct bch_dev *ca;
-};
-
-struct extent_pick_ptr
-bch_btree_pick_ptr(struct bch_fs *, const struct btree *);
-
-void bch_extent_pick_ptr_avoiding(struct bch_fs *, struct bkey_s_c,
- struct bch_dev *, struct extent_pick_ptr *);
-
-static inline void
-bch_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
- struct extent_pick_ptr *ret)
-{
- bch_extent_pick_ptr_avoiding(c, k, NULL, ret);
-}
-
-enum extent_insert_hook_ret
-bch_extent_cmpxchg(struct extent_insert_hook *, struct bpos, struct bpos,
- struct bkey_s_c, const struct bkey_i *);
-
-enum btree_insert_ret
-bch_insert_fixup_extent(struct btree_insert *,
- struct btree_insert_entry *);
-
-bool bch_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch_extent_mark_replicas_cached(struct bch_fs *,
- struct bkey_s_extent, unsigned);
-
-unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent);
-unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c);
-
-static inline bool bkey_extent_is_data(const struct bkey *k)
-{
- switch (k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_allocation(const struct bkey *k)
-{
- switch (k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- case BCH_RESERVATION:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool bkey_extent_is_cached(const struct bkey *k)
-{
- return k->type == BCH_EXTENT_CACHED;
-}
-
-static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
-{
- EBUG_ON(k->type != BCH_EXTENT &&
- k->type != BCH_EXTENT_CACHED);
-
- k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
-}
-
-static inline unsigned
-__extent_entry_type(const union bch_extent_entry *e)
-{
- return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
-}
-
-static inline enum bch_extent_entry_type
-extent_entry_type(const union bch_extent_entry *e)
-{
- int ret = __ffs(e->type);
-
- EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
-
- return ret;
-}
-
-static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
-{
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_crc32:
- return sizeof(struct bch_extent_crc32);
- case BCH_EXTENT_ENTRY_crc64:
- return sizeof(struct bch_extent_crc64);
- case BCH_EXTENT_ENTRY_crc128:
- return sizeof(struct bch_extent_crc128);
- case BCH_EXTENT_ENTRY_ptr:
- return sizeof(struct bch_extent_ptr);
- default:
- BUG();
- }
-}
-
-static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
-{
- return extent_entry_bytes(entry) / sizeof(u64);
-}
-
-static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
-{
- return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
-}
-
-static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
-{
- return !extent_entry_is_ptr(e);
-}
-
-union bch_extent_crc {
- u8 type;
- struct bch_extent_crc32 crc32;
- struct bch_extent_crc64 crc64;
- struct bch_extent_crc128 crc128;
-};
-
-/* downcast, preserves const */
-#define to_entry(_entry) \
-({ \
- BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
- !type_is(_entry, struct bch_extent_ptr *)); \
- \
- __builtin_choose_expr( \
- (type_is_exact(_entry, const union bch_extent_crc *) || \
- type_is_exact(_entry, const struct bch_extent_ptr *)), \
- (const union bch_extent_entry *) (_entry), \
- (union bch_extent_entry *) (_entry)); \
-})
-
-#define __entry_to_crc(_entry) \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const union bch_extent_crc *) (_entry), \
- (union bch_extent_crc *) (_entry))
-
-#define entry_to_crc(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
- \
- __entry_to_crc(_entry); \
-})
-
-#define entry_to_ptr(_entry) \
-({ \
- EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
- \
- __builtin_choose_expr( \
- type_is_exact(_entry, const union bch_extent_entry *), \
- (const struct bch_extent_ptr *) (_entry), \
- (struct bch_extent_ptr *) (_entry)); \
-})
-
-enum bch_extent_crc_type {
- BCH_EXTENT_CRC_NONE,
- BCH_EXTENT_CRC32,
- BCH_EXTENT_CRC64,
- BCH_EXTENT_CRC128,
-};
-
-static inline enum bch_extent_crc_type
-__extent_crc_type(const union bch_extent_crc *crc)
-{
- if (!crc)
- return BCH_EXTENT_CRC_NONE;
-
- switch (extent_entry_type(to_entry(crc))) {
- case BCH_EXTENT_ENTRY_crc32:
- return BCH_EXTENT_CRC32;
- case BCH_EXTENT_ENTRY_crc64:
- return BCH_EXTENT_CRC64;
- case BCH_EXTENT_ENTRY_crc128:
- return BCH_EXTENT_CRC128;
- default:
- BUG();
- }
-}
-
-#define extent_crc_type(_crc) \
-({ \
- BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \
- !type_is(_crc, struct bch_extent_crc64 *) && \
- !type_is(_crc, struct bch_extent_crc128 *) && \
- !type_is(_crc, union bch_extent_crc *)); \
- \
- type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \
- : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \
- : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
- : __extent_crc_type((union bch_extent_crc *) _crc); \
-})
-
-#define extent_entry_next(_entry) \
- ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
-
-#define extent_entry_last(_e) \
- vstruct_idx((_e).v, bkey_val_u64s((_e).k))
-
-/* Iterate over all entries: */
-
-#define extent_for_each_entry_from(_e, _entry, _start) \
- for ((_entry) = _start; \
- (_entry) < extent_entry_last(_e); \
- (_entry) = extent_entry_next(_entry))
-
-#define extent_for_each_entry(_e, _entry) \
- extent_for_each_entry_from(_e, _entry, (_e).v->start)
-
-/* Iterate over crcs only: */
-
-#define extent_crc_next(_e, _p) \
-({ \
- typeof(&(_e).v->start[0]) _entry = _p; \
- \
- while ((_entry) < extent_entry_last(_e) && \
- !extent_entry_is_crc(_entry)) \
- (_entry) = extent_entry_next(_entry); \
- \
- entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
-})
-
-#define extent_for_each_crc(_e, _crc) \
- for ((_crc) = extent_crc_next(_e, (_e).v->start); \
- (_crc); \
- (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
-
-/* Iterate over pointers, with crcs: */
-
-#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \
-({ \
- __label__ out; \
- typeof(&(_e).v->start[0]) _entry; \
- \
- extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
- if (extent_entry_is_crc(_entry)) { \
- (_crc) = entry_to_crc(_entry); \
- } else { \
- _ptr = entry_to_ptr(_entry); \
- if (_filter) \
- goto out; \
- } \
- \
- _ptr = NULL; \
-out: \
- _ptr; \
-})
-
-#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \
- for ((_crc) = NULL, \
- (_ptr) = &(_e).v->start->ptr; \
- ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\
- (_ptr)++)
-
-#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
- extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true)
-
-/* Iterate over pointers only, and from a given position: */
-
-#define extent_ptr_next_filter(_e, _ptr, _filter) \
-({ \
- typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \
- \
- extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \
-})
-
-#define extent_ptr_next(_e, _ptr) \
- extent_ptr_next_filter(_e, _ptr, true)
-
-#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
- for ((_ptr) = &(_e).v->start->ptr; \
- ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
- (_ptr)++)
-
-#define extent_for_each_ptr(_e, _ptr) \
- extent_for_each_ptr_filter(_e, _ptr, true)
-
-#define extent_ptr_prev(_e, _ptr) \
-({ \
- typeof(&(_e).v->start->ptr) _p; \
- typeof(&(_e).v->start->ptr) _prev = NULL; \
- \
- extent_for_each_ptr(_e, _p) { \
- if (_p == (_ptr)) \
- break; \
- _prev = _p; \
- } \
- \
- _prev; \
-})
-
-/*
- * Use this when you'll be dropping pointers as you iterate. Quadratic,
- * unfortunately:
- */
-#define extent_for_each_ptr_backwards(_e, _ptr) \
- for ((_ptr) = extent_ptr_prev(_e, NULL); \
- (_ptr); \
- (_ptr) = extent_ptr_prev(_e, _ptr))
-
-void bch_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
- unsigned, unsigned, struct bch_csum, unsigned);
-
-static inline void __extent_entry_push(struct bkey_i_extent *e)
-{
- union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
-
- EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
- BKEY_EXTENT_VAL_U64s_MAX);
-
- e->k.u64s += extent_entry_u64s(entry);
-}
-
-static inline void extent_ptr_append(struct bkey_i_extent *e,
- struct bch_extent_ptr ptr)
-{
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- extent_entry_last(extent_i_to_s(e))->ptr = ptr;
- __extent_entry_push(e);
-}
-
-static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k,
- const union bch_extent_crc *crc)
-{
- EBUG_ON(!k->size);
-
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return (struct bch_extent_crc128) {
- ._compressed_size = k->size - 1,
- ._uncompressed_size = k->size - 1,
- };
- case BCH_EXTENT_CRC32:
- return (struct bch_extent_crc128) {
- .type = 1 << BCH_EXTENT_ENTRY_crc128,
- ._compressed_size = crc->crc32._compressed_size,
- ._uncompressed_size = crc->crc32._uncompressed_size,
- .offset = crc->crc32.offset,
- .csum_type = crc->crc32.csum_type,
- .compression_type = crc->crc32.compression_type,
- .csum.lo = crc->crc32.csum,
- };
- case BCH_EXTENT_CRC64:
- return (struct bch_extent_crc128) {
- .type = 1 << BCH_EXTENT_ENTRY_crc128,
- ._compressed_size = crc->crc64._compressed_size,
- ._uncompressed_size = crc->crc64._uncompressed_size,
- .offset = crc->crc64.offset,
- .nonce = crc->crc64.nonce,
- .csum_type = crc->crc64.csum_type,
- .compression_type = crc->crc64.compression_type,
- .csum.lo = crc->crc64.csum_lo,
- .csum.hi = crc->crc64.csum_hi,
- };
- case BCH_EXTENT_CRC128:
- return crc->crc128;
- default:
- BUG();
- }
-}
-
-#define crc_compressed_size(_k, _crc) \
-({ \
- unsigned _size = 0; \
- \
- switch (extent_crc_type(_crc)) { \
- case BCH_EXTENT_CRC_NONE: \
- _size = ((const struct bkey *) (_k))->size; \
- break; \
- case BCH_EXTENT_CRC32: \
- _size = ((struct bch_extent_crc32 *) _crc) \
- ->_compressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC64: \
- _size = ((struct bch_extent_crc64 *) _crc) \
- ->_compressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC128: \
- _size = ((struct bch_extent_crc128 *) _crc) \
- ->_compressed_size + 1; \
- break; \
- } \
- _size; \
-})
-
-#define crc_uncompressed_size(_k, _crc) \
-({ \
- unsigned _size = 0; \
- \
- switch (extent_crc_type(_crc)) { \
- case BCH_EXTENT_CRC_NONE: \
- _size = ((const struct bkey *) (_k))->size; \
- break; \
- case BCH_EXTENT_CRC32: \
- _size = ((struct bch_extent_crc32 *) _crc) \
- ->_uncompressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC64: \
- _size = ((struct bch_extent_crc64 *) _crc) \
- ->_uncompressed_size + 1; \
- break; \
- case BCH_EXTENT_CRC128: \
- _size = ((struct bch_extent_crc128 *) _crc) \
- ->_uncompressed_size + 1; \
- break; \
- } \
- _size; \
-})
-
-static inline unsigned crc_offset(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return 0;
- case BCH_EXTENT_CRC32:
- return crc->crc32.offset;
- case BCH_EXTENT_CRC64:
- return crc->crc64.offset;
- case BCH_EXTENT_CRC128:
- return crc->crc128.offset;
- default:
- BUG();
- }
-}
-
-static inline unsigned crc_nonce(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- case BCH_EXTENT_CRC32:
- return 0;
- case BCH_EXTENT_CRC64:
- return crc->crc64.nonce;
- case BCH_EXTENT_CRC128:
- return crc->crc128.nonce;
- default:
- BUG();
- }
-}
-
-static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return 0;
- case BCH_EXTENT_CRC32:
- return crc->crc32.csum_type;
- case BCH_EXTENT_CRC64:
- return crc->crc64.csum_type;
- case BCH_EXTENT_CRC128:
- return crc->crc128.csum_type;
- default:
- BUG();
- }
-}
-
-static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return 0;
- case BCH_EXTENT_CRC32:
- return crc->crc32.compression_type;
- case BCH_EXTENT_CRC64:
- return crc->crc64.compression_type;
- case BCH_EXTENT_CRC128:
- return crc->crc128.compression_type;
- default:
- BUG();
- }
-}
-
-static inline struct bch_csum crc_csum(const union bch_extent_crc *crc)
-{
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- return (struct bch_csum) { 0 };
- case BCH_EXTENT_CRC32:
- return (struct bch_csum) { .lo = crc->crc32.csum };
- case BCH_EXTENT_CRC64:
- return (struct bch_csum) {
- .lo = crc->crc64.csum_lo,
- .hi = crc->crc64.csum_hi,
- };
- case BCH_EXTENT_CRC128:
- return crc->crc128.csum;
- default:
- BUG();
- }
-}
-
-static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k)
-{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
- unsigned ret = 0;
-
- switch (k.k->type) {
- case BCH_EXTENT:
- case BCH_EXTENT_CACHED:
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr_crc(e, ptr, crc)
- if (!ptr->cached &&
- crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
- crc_compressed_size(e.k, crc) < k.k->size)
- ret = max_t(unsigned, ret,
- crc_compressed_size(e.k, crc));
- }
-
- return ret;
-}
-
-static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
-{
- const union bch_extent_crc *crc;
-
- extent_for_each_crc(e, crc)
- if (bch_csum_type_is_encryption(crc_csum_type(crc)))
- return crc_offset(crc) + crc_nonce(crc);
-
- return 0;
-}
-
-void bch_extent_narrow_crcs(struct bkey_s_extent);
-void bch_extent_drop_redundant_crcs(struct bkey_s_extent);
-
-/* Doesn't clean up redundant crcs */
-static inline void __bch_extent_drop_ptr(struct bkey_s_extent e,
- struct bch_extent_ptr *ptr)
-{
- EBUG_ON(ptr < &e.v->start->ptr ||
- ptr >= &extent_entry_last(e)->ptr);
- EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
- memmove_u64s_down(ptr, ptr + 1,
- (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
- e.k->u64s -= sizeof(*ptr) / sizeof(u64);
-}
-
-static inline void bch_extent_drop_ptr(struct bkey_s_extent e,
- struct bch_extent_ptr *ptr)
-{
- __bch_extent_drop_ptr(e, ptr);
- bch_extent_drop_redundant_crcs(e);
-}
-
-const struct bch_extent_ptr *
-bch_extent_has_device(struct bkey_s_c_extent, unsigned);
-
-bool bch_cut_front(struct bpos, struct bkey_i *);
-bool bch_cut_back(struct bpos, struct bkey *);
-void bch_key_resize(struct bkey *, unsigned);
-
-#endif /* _BCACHE_EXTENTS_H */
diff --git a/libbcache/eytzinger.h b/libbcache/eytzinger.h
deleted file mode 100644
index 13d54e5e..00000000
--- a/libbcache/eytzinger.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
-
-#include <linux/bitops.h>
-#include <linux/log2.h>
-
-#include "util.h"
-
-/*
- * Traversal for trees in eytzinger layout - a full binary tree laid out in an
- * array
- *
- * We use one-based indexing, not zero-based: with one-based indexing, each
- * level of the tree starts at a power of two - leading to better alignment -
- * and it's what you want for implementing next/prev and to/from inorder.
- *
- * To/from inorder also uses 1 based indexing.
- *
- * Size parameter is treated as if we were using 0 based indexing, however:
- * valid nodes, and inorder indices, are in the range [1..size)
- */
-
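A concrete picture of the layout described above (editor's illustration, not part of the original header):

/*
 * A 7 node tree over the sorted keys A..G, using 1-based indices:
 *
 *              1:D
 *            /     \
 *         2:B       3:F
 *        /   \     /   \
 *      4:A   5:C 6:E   7:G
 *
 * array: [_, D, B, F, A, C, E, G]   (slot 0 unused)
 *
 * Each level starts at a power of two (1, 2, 4), node j's children are
 * at 2j and 2j + 1, and an inorder walk visits A..G in sorted order.
 */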
-static inline unsigned eytzinger_child(unsigned j, unsigned child)
-{
- EBUG_ON(child > 1);
-
- return (j << 1) + child;
-}
-
-static inline unsigned eytzinger_left_child(unsigned j)
-{
- return eytzinger_child(j, 0);
-}
-
-static inline unsigned eytzinger_right_child(unsigned j)
-{
- return eytzinger_child(j, 1);
-}
-
-static inline unsigned eytzinger_first(unsigned size)
-{
- return rounddown_pow_of_two(size - 1);
-}
-
-static inline unsigned eytzinger_last(unsigned size)
-{
- return rounddown_pow_of_two(size) - 1;
-}
-
-/*
- * eytzinger_next() and eytzinger_prev() have the nice properties that
- *
- * eytzinger_next(0) == eytzinger_first()
- * eytzinger_prev(0) == eytzinger_last()
- *
- * eytzinger_prev(eytzinger_first()) == 0
- * eytzinger_next(eytzinger_last()) == 0
- */
-
-static inline unsigned eytzinger_next(unsigned j, unsigned size)
-{
- EBUG_ON(j >= size);
-
- if (eytzinger_right_child(j) < size) {
- j = eytzinger_right_child(j);
-
- j <<= __fls(size) - __fls(j);
- j >>= j >= size;
- } else {
- j >>= ffz(j) + 1;
- }
-
- return j;
-}
-
-static inline unsigned eytzinger_prev(unsigned j, unsigned size)
-{
- EBUG_ON(j >= size);
-
- if (eytzinger_left_child(j) < size) {
- j = eytzinger_left_child(j);
-
- j <<= __fls(size) - __fls(j);
- j -= 1;
- j >>= j >= size;
- } else {
- j >>= __ffs(j) + 1;
- }
-
- return j;
-}
-
-static inline unsigned eytzinger_extra(unsigned size)
-{
- return (size - rounddown_pow_of_two(size - 1)) << 1;
-}
-
-static inline unsigned __eytzinger_to_inorder(unsigned j, unsigned size,
- unsigned extra)
-{
- unsigned b = __fls(j);
- unsigned shift = __fls(size - 1) - b;
- int s;
-
- EBUG_ON(!j || j >= size);
-
- j ^= 1U << b;
- j <<= 1;
- j |= 1;
- j <<= shift;
-
- /*
- * sign bit trick:
- *
- * if (j > extra)
- * j -= (j - extra) >> 1;
- */
- s = extra - j;
- j += (s >> 1) & (s >> 31);
-
- return j;
-}
-
-static inline unsigned __inorder_to_eytzinger(unsigned j, unsigned size,
- unsigned extra)
-{
- unsigned shift;
- int s;
-
- EBUG_ON(!j || j >= size);
-
- /*
- * sign bit trick:
- *
- * if (j > extra)
- * j += j - extra;
- */
- s = extra - j;
- j -= s & (s >> 31);
-
- shift = __ffs(j);
-
- j >>= shift + 1;
- j |= 1U << (__fls(size - 1) - shift);
-
- return j;
-}
-
-static inline unsigned eytzinger_to_inorder(unsigned j, unsigned size)
-{
- return __eytzinger_to_inorder(j, size, eytzinger_extra(size));
-}
-
-static inline unsigned inorder_to_eytzinger(unsigned j, unsigned size)
-{
- return __inorder_to_eytzinger(j, size, eytzinger_extra(size));
-}
-
-#define eytzinger_for_each(_i, _size) \
- for ((_i) = eytzinger_first((_size)); \
- (_i) != 0; \
- (_i) = eytzinger_next((_i), (_size)))
-
-#if 0
-void eytzinger_test(void)
-{
- unsigned i, j, size;
-
- for (size = 2;
- size < 65536000;
- size++) {
- if (!(size % 4096))
- printk(KERN_INFO "tree size %u\n", size);
-
- assert(eytzinger_prev(0, size) == eytzinger_last(size));
- assert(eytzinger_next(0, size) == eytzinger_first(size));
-
- assert(eytzinger_prev(eytzinger_first(size), size) == 0);
- assert(eytzinger_next(eytzinger_last(size), size) == 0);
-
- eytzinger_for_each(j, size) {
-			i = eytzinger_to_inorder(j, size);
-			assert(inorder_to_eytzinger(i, size) == j);
-
- if (j != eytzinger_last(size)) {
- unsigned next = eytzinger_next(j, size);
-
- assert(eytzinger_prev(next, size) == j);
- }
- }
- }
-
-}
-#endif
-
-#endif /* _EYTZINGER_H */
diff --git a/libbcache/fifo.h b/libbcache/fifo.h
deleted file mode 100644
index 2908ca23..00000000
--- a/libbcache/fifo.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef _BCACHE_FIFO_H
-#define _BCACHE_FIFO_H
-
-#define DECLARE_FIFO(type, name) \
- struct { \
- size_t front, back, size, mask; \
- type *data; \
- } name
-
-#define init_fifo(fifo, _size, _gfp) \
-({ \
- bool _ret = true; \
- gfp_t gfp_flags = (_gfp); \
- \
- if (gfp_flags & GFP_KERNEL) \
- gfp_flags |= __GFP_NOWARN; \
- \
- (fifo)->size = (_size); \
- (fifo)->front = (fifo)->back = 0; \
- (fifo)->data = NULL; \
- \
- if ((fifo)->size) { \
- size_t _allocated_size, _bytes; \
- \
- _allocated_size = roundup_pow_of_two((fifo)->size); \
- _bytes = _allocated_size * sizeof(*(fifo)->data); \
- \
- (fifo)->mask = _allocated_size - 1; \
- \
- if (_bytes < KMALLOC_MAX_SIZE) \
- (fifo)->data = kmalloc(_bytes, gfp_flags); \
- if ((!(fifo)->data) && (gfp_flags & GFP_KERNEL)) \
- (fifo)->data = vmalloc(_bytes); \
- if ((!(fifo)->data)) \
- _ret = false; \
- } \
- _ret; \
-})
-
-#define free_fifo(fifo) \
-do { \
- kvfree((fifo)->data); \
- (fifo)->data = NULL; \
-} while (0)
-
-#define fifo_swap(l, r) \
-do { \
- swap((l)->front, (r)->front); \
- swap((l)->back, (r)->back); \
- swap((l)->size, (r)->size); \
- swap((l)->mask, (r)->mask); \
- swap((l)->data, (r)->data); \
-} while (0)
-
-#define fifo_move(dest, src) \
-do { \
- typeof(*((dest)->data)) _t; \
- while (!fifo_full(dest) && \
- fifo_pop(src, _t)) \
- fifo_push(dest, _t); \
-} while (0)
-
-#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
-#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
-
-#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
-#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
-
-#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
-#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
-
-#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-
-#define fifo_push_back(fifo, i) \
-({ \
- bool _r = !fifo_full((fifo)); \
- if (_r) \
- (fifo)->data[(fifo)->back++ & (fifo)->mask] = (i); \
- _r; \
-})
-
-#define fifo_pop_front(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
- (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
- _r; \
-})
-
-#define fifo_push_front(fifo, i) \
-({ \
- bool _r = !fifo_full((fifo)); \
- if (_r) \
- (fifo)->data[--(fifo)->front & (fifo)->mask] = (i); \
- _r; \
-})
-
-#define fifo_pop_back(fifo, i) \
-({ \
- bool _r = !fifo_empty((fifo)); \
- if (_r) \
-		(i) = (fifo)->data[--(fifo)->back & (fifo)->mask];	\
- _r; \
-})
-
-#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
-#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
-#define fifo_peek(fifo) fifo_peek_front(fifo)
-
-#define fifo_for_each_entry(_entry, _fifo, _iter) \
- for (_iter = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- _iter++)
-
-#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
- for (_iter = (_fifo)->front; \
- ((_iter != (_fifo)->back) && \
- (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
- _iter++)
-
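A minimal usage sketch (editor's illustration; fifo_example() and the int element type are assumptions, not part of the original header), compiled out in the same spirit as the #if 0 test block in eytzinger.h:

#if 0
static int fifo_example(void)
{
	DECLARE_FIFO(int, fifo);
	int i;

	if (!init_fifo(&fifo, 8, GFP_KERNEL))
		return -ENOMEM;

	fifo_push(&fifo, 1);
	fifo_push(&fifo, 2);

	while (fifo_pop(&fifo, i))	/* pops 1, then 2: FIFO order */
		;

	free_fifo(&fifo);
	return 0;
}
#endif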
-#endif /* _BCACHE_FIFO_H */
-
diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c
deleted file mode 100644
index 1f6a65ec..00000000
--- a/libbcache/fs-gc.c
+++ /dev/null
@@ -1,924 +0,0 @@
-
-#include "bcache.h"
-#include "btree_update.h"
-#include "dirent.h"
-#include "error.h"
-#include "fs.h"
-#include "fs-gc.h"
-#include "inode.h"
-#include "keylist.h"
-#include "super.h"
-
-#include <linux/generic-radix-tree.h>
-
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
-static int remove_dirent(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_s_c_dirent dirent)
-{
- struct qstr name;
- struct bch_inode_unpacked dir_inode;
- struct bch_hash_info dir_hash_info;
- u64 dir_inum = dirent.k->p.inode;
- int ret;
- char *buf;
-
- name.len = bch_dirent_name_bytes(dirent);
- buf = kmalloc(name.len + 1, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- memcpy(buf, dirent.v->d_name, name.len);
- buf[name.len] = '\0';
- name.name = buf;
-
- /* Unlock iter so we don't deadlock, after copying name: */
- bch_btree_iter_unlock(iter);
-
- ret = bch_inode_find_by_inum(c, dir_inum, &dir_inode);
- if (ret)
- goto err;
-
- dir_hash_info = bch_hash_info_init(&dir_inode);
-
- ret = bch_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
-err:
- kfree(buf);
- return ret;
-}
-
-static int reattach_inode(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- u64 inum)
-{
- struct bch_hash_info lostfound_hash_info =
- bch_hash_info_init(lostfound_inode);
- struct bkey_inode_buf packed;
- char name_buf[20];
- struct qstr name;
- int ret;
-
- snprintf(name_buf, sizeof(name_buf), "%llu", inum);
- name = (struct qstr) QSTR(name_buf);
-
- lostfound_inode->i_nlink++;
-
- bch_inode_pack(&packed, lostfound_inode);
-
- ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL, NULL, 0);
- if (ret)
- return ret;
-
- return bch_dirent_create(c, lostfound_inode->inum,
- &lostfound_hash_info,
- DT_DIR, &name, inum, NULL, 0);
-}
-
-struct inode_walker {
- bool first_this_inode;
- bool have_inode;
- u64 cur_inum;
- struct bch_inode_unpacked inode;
-};
-
-static struct inode_walker inode_walker_init(void)
-{
- return (struct inode_walker) {
- .cur_inum = -1,
- .have_inode = false,
- };
-}
-
-static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
-{
- w->first_this_inode = inum != w->cur_inum;
- w->cur_inum = inum;
-
- if (w->first_this_inode) {
- int ret = bch_inode_find_by_inum(c, inum, &w->inode);
-
- if (ret && ret != -ENOENT)
- return ret;
-
- w->have_inode = !ret;
- }
-
- return 0;
-}
-
-/*
- * Walk extents: verify that extents have a corresponding S_ISREG inode, and
- * that i_size and i_sectors are consistent
- */
-noinline_for_stack
-static int check_extents(struct bch_fs *c)
-{
- struct inode_walker w = inode_walker_init();
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 i_sectors;
- int ret = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(BCACHE_ROOT_INO, 0), k) {
- if (k.k->type == KEY_TYPE_DISCARD)
- continue;
-
- ret = walk_inode(c, &w, k.k->p.inode);
- if (ret)
- break;
-
- unfixable_fsck_err_on(!w.have_inode, c,
- "extent type %u for missing inode %llu",
- k.k->type, k.k->p.inode);
-
- unfixable_fsck_err_on(w.first_this_inode && w.have_inode &&
- w.inode.i_sectors !=
- (i_sectors = bch_count_inode_sectors(c, w.cur_inum)),
- c, "i_sectors wrong: got %llu, should be %llu",
- w.inode.i_sectors, i_sectors);
-
- unfixable_fsck_err_on(w.have_inode &&
- !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c,
- "extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.inode.i_mode);
-
- unfixable_fsck_err_on(k.k->type != BCH_RESERVATION &&
- k.k->p.offset > round_up(w.inode.i_size, PAGE_SIZE) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size);
- }
-fsck_err:
- return bch_btree_iter_unlock(&iter) ?: ret;
-}
-
-/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
- */
-noinline_for_stack
-static int check_dirents(struct bch_fs *c)
-{
- struct inode_walker w = inode_walker_init();
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(BCACHE_ROOT_INO, 0), k) {
- struct bkey_s_c_dirent d;
- struct bch_inode_unpacked target;
- bool have_target;
- u64 d_inum;
-
- ret = walk_inode(c, &w, k.k->p.inode);
- if (ret)
- break;
-
- unfixable_fsck_err_on(!w.have_inode, c,
- "dirent in nonexisting directory %llu",
- k.k->p.inode);
-
- unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
- "dirent in non directory inode %llu, type %u",
- k.k->p.inode, mode_to_type(w.inode.i_mode));
-
- if (k.k->type != BCH_DIRENT)
- continue;
-
- d = bkey_s_c_to_dirent(k);
- d_inum = le64_to_cpu(d.v->d_inum);
-
- if (fsck_err_on(d_inum == d.k->p.inode, c,
- "dirent points to own directory")) {
- ret = remove_dirent(c, &iter, d);
- if (ret)
- goto err;
- continue;
- }
-
- ret = bch_inode_find_by_inum(c, d_inum, &target);
- if (ret && ret != -ENOENT)
- break;
-
- have_target = !ret;
- ret = 0;
-
- if (fsck_err_on(!have_target, c,
- "dirent points to missing inode %llu, type %u filename %s",
- d_inum, d.v->d_type, d.v->d_name)) {
- ret = remove_dirent(c, &iter, d);
- if (ret)
- goto err;
- continue;
- }
-
- if (fsck_err_on(have_target &&
- d.v->d_type !=
- mode_to_type(le16_to_cpu(target.i_mode)), c,
- "incorrect d_type: got %u should be %u, filename %s",
- d.v->d_type,
- mode_to_type(le16_to_cpu(target.i_mode)),
- d.v->d_name)) {
- struct bkey_i_dirent *n;
-
- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
- if (!n) {
- ret = -ENOMEM;
- goto err;
- }
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(le16_to_cpu(target.i_mode));
-
- ret = bch_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &n->k_i));
- kfree(n);
- if (ret)
- goto err;
-
- }
- }
-err:
-fsck_err:
- return bch_btree_iter_unlock(&iter) ?: ret;
-}
-
-/*
- * Walk xattrs: verify that they all have a corresponding inode
- */
-noinline_for_stack
-static int check_xattrs(struct bch_fs *c)
-{
- struct inode_walker w = inode_walker_init();
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
- POS(BCACHE_ROOT_INO, 0), k) {
- ret = walk_inode(c, &w, k.k->p.inode);
- if (ret)
- break;
-
- unfixable_fsck_err_on(!w.have_inode, c,
- "xattr for missing inode %llu",
- k.k->p.inode);
- }
-fsck_err:
- return bch_btree_iter_unlock(&iter) ?: ret;
-}
-
-/* Get root directory, create if it doesn't exist: */
-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
-{
- struct bkey_inode_buf packed;
- int ret;
-
- ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode);
- if (ret && ret != -ENOENT)
- return ret;
-
- if (fsck_err_on(ret, c, "root directory missing"))
- goto create_root;
-
- if (fsck_err_on(!S_ISDIR(root_inode->i_mode), c,
- "root inode not a directory"))
- goto create_root;
-
- return 0;
-fsck_err:
- return ret;
-create_root:
- bch_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
- root_inode->inum = BCACHE_ROOT_INO;
-
- bch_inode_pack(&packed, root_inode);
-
- return bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL, NULL, 0);
-}
-
-/* Get lost+found, create if it doesn't exist: */
-static int check_lostfound(struct bch_fs *c,
- struct bch_inode_unpacked *root_inode,
- struct bch_inode_unpacked *lostfound_inode)
-{
- struct qstr lostfound = QSTR("lost+found");
- struct bch_hash_info root_hash_info =
- bch_hash_info_init(root_inode);
- struct bkey_inode_buf packed;
- u64 inum;
- int ret;
-
- inum = bch_dirent_lookup(c, BCACHE_ROOT_INO, &root_hash_info,
- &lostfound);
- if (!inum) {
- bch_notice(c, "creating lost+found");
- goto create_lostfound;
- }
-
- ret = bch_inode_find_by_inum(c, inum, lostfound_inode);
- if (ret && ret != -ENOENT)
- return ret;
-
- if (fsck_err_on(ret, c, "lost+found missing"))
- goto create_lostfound;
-
- if (fsck_err_on(!S_ISDIR(lostfound_inode->i_mode), c,
- "lost+found inode not a directory"))
- goto create_lostfound;
-
- return 0;
-fsck_err:
- return ret;
-create_lostfound:
- root_inode->i_nlink++;
-
- bch_inode_pack(&packed, root_inode);
-
- ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
- NULL, NULL, NULL, 0);
- if (ret)
- return ret;
-
- bch_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
- bch_inode_pack(&packed, lostfound_inode);
-
- ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
- if (ret)
- return ret;
-
- lostfound_inode->inum = packed.inode.k.p.inode;
-
- ret = bch_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR,
- &lostfound, lostfound_inode->inum, NULL, 0);
- if (ret)
- return ret;
-
- return 0;
-}
-
-struct inode_bitmap {
- unsigned long *bits;
- size_t size;
-};
-
-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
-{
- return nr < b->size ? test_bit(nr, b->bits) : false;
-}
-
-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
-{
- if (nr >= b->size) {
- size_t new_size = max(max(PAGE_SIZE * 8,
- b->size * 2),
- nr + 1);
- void *n;
-
- new_size = roundup_pow_of_two(new_size);
- n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
- if (!n)
- return -ENOMEM;
-
- b->bits = n;
- b->size = new_size;
- }
-
- __set_bit(nr, b->bits);
- return 0;
-}
-
-struct pathbuf {
- size_t nr;
- size_t size;
-
- struct pathbuf_entry {
- u64 inum;
- u64 offset;
- } *entries;
-};
-
-static int path_down(struct pathbuf *p, u64 inum)
-{
- if (p->nr == p->size) {
- size_t new_size = max(256UL, p->size * 2);
- void *n = krealloc(p->entries,
- new_size * sizeof(p->entries[0]),
- GFP_KERNEL);
- if (!n)
- return -ENOMEM;
-
- p->entries = n;
- p->size = new_size;
-	}
-
- p->entries[p->nr++] = (struct pathbuf_entry) {
- .inum = inum,
- .offset = 0,
- };
- return 0;
-}
-
-noinline_for_stack
-static int check_directory_structure(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode)
-{
- struct inode_bitmap dirs_done = { NULL, 0 };
- struct pathbuf path = { 0, 0, NULL };
- struct pathbuf_entry *e;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent dirent;
- bool had_unreachable;
- u64 d_inum;
- int ret = 0;
-
- /* DFS: */
-restart_dfs:
- ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO);
- if (ret)
- goto err;
-
- ret = path_down(&path, BCACHE_ROOT_INO);
- if (ret)
- return ret;
-
- while (path.nr) {
-next:
- e = &path.entries[path.nr - 1];
-
- if (e->offset == U64_MAX)
- goto up;
-
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
- POS(e->inum, e->offset + 1), k) {
- if (k.k->p.inode != e->inum)
- break;
-
- e->offset = k.k->p.offset;
-
- if (k.k->type != BCH_DIRENT)
- continue;
-
- dirent = bkey_s_c_to_dirent(k);
-
- if (dirent.v->d_type != DT_DIR)
- continue;
-
- d_inum = le64_to_cpu(dirent.v->d_inum);
-
- if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
- "directory with multiple hardlinks")) {
- ret = remove_dirent(c, &iter, dirent);
- if (ret)
- goto err;
- continue;
- }
-
- ret = inode_bitmap_set(&dirs_done, d_inum);
- if (ret)
- goto err;
-
- ret = path_down(&path, d_inum);
- if (ret)
- goto err;
-
- bch_btree_iter_unlock(&iter);
- goto next;
- }
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- goto err;
-up:
- path.nr--;
- }
-
- had_unreachable = false;
-
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
- if (k.k->type != BCH_INODE_FS ||
- !S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode)))
- continue;
-
- if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.inode), c,
- "unreachable directory found (inum %llu)",
- k.k->p.inode)) {
- bch_btree_iter_unlock(&iter);
-
- ret = reattach_inode(c, lostfound_inode, k.k->p.inode);
- if (ret)
- goto err;
-
- had_unreachable = true;
- }
- }
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- goto err;
-
- if (had_unreachable) {
- bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
- kfree(dirs_done.bits);
- kfree(path.entries);
- memset(&dirs_done, 0, sizeof(dirs_done));
- memset(&path, 0, sizeof(path));
- goto restart_dfs;
- }
-
-out:
- kfree(dirs_done.bits);
- kfree(path.entries);
- return ret;
-err:
-fsck_err:
- ret = bch_btree_iter_unlock(&iter) ?: ret;
- goto out;
-}
-
-struct nlink {
- u32 count;
- u32 dir_count;
-};
-
-typedef GENRADIX(struct nlink) nlink_table;
-
-static void inc_link(struct bch_fs *c, nlink_table *links,
- u64 range_start, u64 *range_end,
- u64 inum, bool dir)
-{
- struct nlink *link;
-
- if (inum < range_start || inum >= *range_end)
- return;
-
- link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
- if (!link) {
- bch_verbose(c, "allocation failed during fs gc - will need another pass");
- *range_end = inum;
- return;
- }
-
- if (dir)
- link->dir_count++;
- else
- link->count++;
-}
-
-noinline_for_stack
-static int bch_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
- u64 range_start, u64 *range_end)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- u64 d_inum;
- int ret;
-
- inc_link(c, links, range_start, range_end, BCACHE_ROOT_INO, false);
-
- for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, k) {
- switch (k.k->type) {
- case BCH_DIRENT:
- d = bkey_s_c_to_dirent(k);
- d_inum = le64_to_cpu(d.v->d_inum);
-
- if (d.v->d_type == DT_DIR)
- inc_link(c, links, range_start, range_end,
- d.k->p.inode, true);
-
- inc_link(c, links, range_start, range_end,
- d_inum, false);
-
- break;
- }
-
- bch_btree_iter_cond_resched(&iter);
- }
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- bch_err(c, "error in fs gc: btree error %i while walking dirents", ret);
-
- return ret;
-}
-
-s64 bch_count_inode_sectors(struct bch_fs *c, u64 inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 sectors = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0), k) {
- if (k.k->p.inode != inum)
- break;
-
- if (bkey_extent_is_allocation(k.k))
- sectors += k.k->size;
- }
-
- return bch_btree_iter_unlock(&iter) ?: sectors;
-}
-
-static int bch_gc_do_inode(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- struct btree_iter *iter,
- struct bkey_s_c_inode inode, struct nlink link)
-{
- struct bch_inode_unpacked u;
- int ret = 0;
- u32 i_nlink, real_i_nlink;
- bool do_update = false;
-
- ret = bch_inode_unpack(inode, &u);
- if (bch_fs_inconsistent_on(ret, c,
- "error unpacking inode %llu in fs-gc",
- inode.k->p.inode))
- return ret;
-
- i_nlink = u.i_nlink + nlink_bias(u.i_mode);
-
- fsck_err_on(i_nlink < link.count, c,
- "inode %llu i_link too small (%u < %u, type %i)",
- inode.k->p.inode, i_nlink,
- link.count, mode_to_type(u.i_mode));
-
- /* These should have been caught/fixed by earlier passes: */
- if (S_ISDIR(u.i_mode)) {
- need_fsck_err_on(link.count > 1, c,
- "directory %llu with multiple hardlinks: %u",
- inode.k->p.inode, link.count);
-
- real_i_nlink = link.count * 2 + link.dir_count;
- } else {
- need_fsck_err_on(link.dir_count, c,
- "found dirents for non directory %llu",
- inode.k->p.inode);
-
- real_i_nlink = link.count + link.dir_count;
- }
-
- if (!link.count) {
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but found orphaned inode %llu",
- inode.k->p.inode);
-
- if (fsck_err_on(S_ISDIR(u.i_mode) &&
- bch_empty_dir(c, inode.k->p.inode), c,
- "non empty directory with link count 0, "
- "inode nlink %u, dir links found %u",
- i_nlink, link.dir_count)) {
- ret = reattach_inode(c, lostfound_inode,
- inode.k->p.inode);
- if (ret)
- return ret;
- }
-
- bch_verbose(c, "deleting inode %llu", inode.k->p.inode);
-
- ret = bch_inode_rm(c, inode.k->p.inode);
- if (ret)
- bch_err(c, "error in fs gc: error %i "
- "while deleting inode", ret);
- return ret;
- }
-
- if (u.i_flags & BCH_INODE_I_SIZE_DIRTY) {
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but inode %llu has i_size dirty",
- inode.k->p.inode);
-
- bch_verbose(c, "truncating inode %llu", inode.k->p.inode);
-
- /*
- * XXX: need to truncate partial blocks too here - or ideally
- * just switch units to bytes and that issue goes away
- */
-
- ret = bch_inode_truncate(c, inode.k->p.inode,
- round_up(u.i_size, PAGE_SIZE) >> 9,
- NULL, NULL);
- if (ret) {
- bch_err(c, "error in fs gc: error %i "
- "truncating inode", ret);
- return ret;
- }
-
- /*
- * We truncated without our normal sector accounting hook, just
- * make sure we recalculate it:
- */
- u.i_flags |= BCH_INODE_I_SECTORS_DIRTY;
-
- u.i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
- do_update = true;
- }
-
- if (u.i_flags & BCH_INODE_I_SECTORS_DIRTY) {
- s64 sectors;
-
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but inode %llu has i_sectors dirty",
- inode.k->p.inode);
-
- bch_verbose(c, "recounting sectors for inode %llu",
- inode.k->p.inode);
-
- sectors = bch_count_inode_sectors(c, inode.k->p.inode);
- if (sectors < 0) {
- bch_err(c, "error in fs gc: error %i "
- "recounting inode sectors",
- (int) sectors);
- return sectors;
- }
-
- u.i_sectors = sectors;
- u.i_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
- do_update = true;
- }
-
- if (i_nlink != real_i_nlink) {
- fsck_err_on(c->sb.clean, c,
- "filesystem marked clean, "
- "but inode %llu has wrong i_nlink "
- "(type %u i_nlink %u, should be %u)",
- inode.k->p.inode, mode_to_type(u.i_mode),
- i_nlink, real_i_nlink);
-
- bch_verbose(c, "setting inode %llu nlinks from %u to %u",
- inode.k->p.inode, i_nlink, real_i_nlink);
-		u.i_nlink = real_i_nlink - nlink_bias(u.i_mode);
- do_update = true;
- }
-
- if (do_update) {
- struct bkey_inode_buf p;
-
- bch_inode_pack(&p, &u);
-
- ret = bch_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(iter, &p.inode.k_i));
- if (ret && ret != -EINTR)
- bch_err(c, "error in fs gc: error %i "
- "updating inode", ret);
- }
-fsck_err:
- return ret;
-}
-
-noinline_for_stack
-static int bch_gc_walk_inodes(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode,
- nlink_table *links,
- u64 range_start, u64 range_end)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct nlink *link, zero_links = { 0, 0 };
- struct genradix_iter nlinks_iter;
- int ret = 0, ret2 = 0;
- u64 nlinks_pos;
-
- bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(range_start, 0));
- genradix_iter_init(&nlinks_iter);
-
- while ((k = bch_btree_iter_peek(&iter)).k &&
- !btree_iter_err(k)) {
-peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
-
- if (!link && (!k.k || iter.pos.inode >= range_end))
- break;
-
- nlinks_pos = range_start + nlinks_iter.pos;
- if (iter.pos.inode > nlinks_pos) {
- /* Should have been caught by dirents pass: */
- need_fsck_err_on(link && link->count, c,
- "missing inode %llu (nlink %u)",
- nlinks_pos, link->count);
- genradix_iter_advance(&nlinks_iter, links);
- goto peek_nlinks;
- }
-
- if (iter.pos.inode < nlinks_pos || !link)
- link = &zero_links;
-
- if (k.k && k.k->type == BCH_INODE_FS) {
- /*
- * Avoid potential deadlocks with iter for
- * truncate/rm/etc.:
- */
- bch_btree_iter_unlock(&iter);
-
- ret = bch_gc_do_inode(c, lostfound_inode, &iter,
- bkey_s_c_to_inode(k), *link);
- if (ret == -EINTR)
- continue;
- if (ret)
- break;
-
- if (link->count)
- atomic_long_inc(&c->nr_inodes);
- } else {
- /* Should have been caught by dirents pass: */
- need_fsck_err_on(link->count, c,
- "missing inode %llu (nlink %u)",
- nlinks_pos, link->count);
- }
-
- if (nlinks_pos == iter.pos.inode)
- genradix_iter_advance(&nlinks_iter, links);
-
- bch_btree_iter_advance_pos(&iter);
- bch_btree_iter_cond_resched(&iter);
- }
-fsck_err:
- ret2 = bch_btree_iter_unlock(&iter);
- if (ret2)
- bch_err(c, "error in fs gc: btree error %i while walking inodes", ret2);
-
- return ret ?: ret2;
-}
-
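-/*
- * Build up the link count table one range of inode numbers at a time: each
- * iteration counts links by walking dirents for the current range, then
- * checks the inodes in that range against those counts, repeating until the
- * whole inode space has been covered:
- */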
-noinline_for_stack
-static int check_inode_nlinks(struct bch_fs *c,
- struct bch_inode_unpacked *lostfound_inode)
-{
- nlink_table links;
- u64 this_iter_range_start, next_iter_range_start = 0;
- int ret = 0;
-
- genradix_init(&links);
-
- do {
- this_iter_range_start = next_iter_range_start;
- next_iter_range_start = U64_MAX;
-
- ret = bch_gc_walk_dirents(c, &links,
- this_iter_range_start,
- &next_iter_range_start);
- if (ret)
- break;
-
- ret = bch_gc_walk_inodes(c, lostfound_inode, &links,
- this_iter_range_start,
- next_iter_range_start);
- if (ret)
- break;
-
- genradix_free(&links);
- } while (next_iter_range_start != U64_MAX);
-
- genradix_free(&links);
-
- return ret;
-}
-
-/*
- * Checks for inconsistencies that shouldn't happen, unless we have a bug.
- * Doesn't fix them yet, mainly because they haven't yet been observed:
- */
-int bch_fsck(struct bch_fs *c, bool full_fsck)
-{
- struct bch_inode_unpacked root_inode, lostfound_inode;
- int ret;
-
- ret = check_root(c, &root_inode);
- if (ret)
- return ret;
-
- ret = check_lostfound(c, &root_inode, &lostfound_inode);
- if (ret)
- return ret;
-
- if (!full_fsck)
- goto check_nlinks;
-
- ret = check_extents(c);
- if (ret)
- return ret;
-
- ret = check_dirents(c);
- if (ret)
- return ret;
-
- ret = check_xattrs(c);
- if (ret)
- return ret;
-
- ret = check_directory_structure(c, &lostfound_inode);
- if (ret)
- return ret;
-check_nlinks:
- ret = check_inode_nlinks(c, &lostfound_inode);
- if (ret)
- return ret;
-
- return 0;
-}
diff --git a/libbcache/fs-gc.h b/libbcache/fs-gc.h
deleted file mode 100644
index ac86fd22..00000000
--- a/libbcache/fs-gc.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _BCACHE_FS_GC_H
-#define _BCACHE_FS_GC_H
-
-s64 bch_count_inode_sectors(struct bch_fs *, u64);
-int bch_fsck(struct bch_fs *, bool);
-
-#endif /* _BCACHE_FS_GC_H */
diff --git a/libbcache/fs-io.c b/libbcache/fs-io.c
deleted file mode 100644
index afc8c208..00000000
--- a/libbcache/fs-io.c
+++ /dev/null
@@ -1,2496 +0,0 @@
-
-#include "bcache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "clock.h"
-#include "error.h"
-#include "fs.h"
-#include "fs-gc.h"
-#include "fs-io.h"
-#include "inode.h"
-#include "journal.h"
-#include "io.h"
-#include "keylist.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/falloc.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/pagevec.h>
-#include <linux/task_io_accounting_ops.h>
-#include <linux/uio.h>
-#include <linux/writeback.h>
-#include <trace/events/writeback.h>
-
-struct bio_set *bch_writepage_bioset;
-struct bio_set *bch_dio_read_bioset;
-struct bio_set *bch_dio_write_bioset;
-
-/* pagecache_block must be held */
-static int write_invalidate_inode_pages_range(struct address_space *mapping,
- loff_t start, loff_t end)
-{
- int ret;
-
- /*
- * XXX: the way this is currently implemented, we can spin if a process
- * is continually redirtying a specific page
- */
- do {
- if (!mapping->nrpages &&
- !mapping->nrexceptional)
- return 0;
-
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- break;
-
- if (!mapping->nrpages)
- return 0;
-
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- } while (ret == -EBUSY);
-
- return ret;
-}
-
-/* i_size updates: */
-
-static int inode_set_size(struct bch_inode_info *ei,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- loff_t *new_i_size = p;
-
- lockdep_assert_held(&ei->update_lock);
-
- bi->i_size = *new_i_size;
-
- if (atomic_long_read(&ei->i_size_dirty_count))
- bi->i_flags |= BCH_INODE_I_SIZE_DIRTY;
- else
- bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
- return 0;
-}
-
-static int __must_check bch_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *ei,
- loff_t new_size)
-{
- return __bch_write_inode(c, ei, inode_set_size, &new_size);
-}
-
-static inline void i_size_dirty_put(struct bch_inode_info *ei)
-{
- atomic_long_dec_bug(&ei->i_size_dirty_count);
-}
-
-static inline void i_size_dirty_get(struct bch_inode_info *ei)
-{
- lockdep_assert_held(&ei->vfs_inode.i_rwsem);
-
- atomic_long_inc(&ei->i_size_dirty_count);
-}
-
-/* i_sectors accounting: */
-
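-/*
- * While an operation that modifies extents is in flight, the inode is flagged
- * BCH_INODE_I_SECTORS_DIRTY on disk (i_sectors_dirty_get()); the hook
- * accumulates the change in allocated sectors, which is folded into
- * i_blocks/i_sectors and the flag cleared when the last reference is dropped
- * (i_sectors_dirty_put()). If we crash in between, fsck recounts i_sectors
- * from the extents btree:
- */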
-static enum extent_insert_hook_ret
-i_sectors_hook_fn(struct extent_insert_hook *hook,
- struct bpos committed_pos,
- struct bpos next_pos,
- struct bkey_s_c k,
- const struct bkey_i *insert)
-{
- struct i_sectors_hook *h = container_of(hook,
- struct i_sectors_hook, hook);
- s64 sectors = next_pos.offset - committed_pos.offset;
- int sign = bkey_extent_is_allocation(&insert->k) -
- (k.k && bkey_extent_is_allocation(k.k));
-
- EBUG_ON(!(h->ei->i_flags & BCH_INODE_I_SECTORS_DIRTY));
- EBUG_ON(!atomic_long_read(&h->ei->i_sectors_dirty_count));
-
- h->sectors += sectors * sign;
-
- return BTREE_HOOK_DO_INSERT;
-}
-
-static int inode_set_i_sectors_dirty(struct bch_inode_info *ei,
- struct bch_inode_unpacked *bi, void *p)
-{
- BUG_ON(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY);
-
- bi->i_flags |= BCH_INODE_I_SECTORS_DIRTY;
- return 0;
-}
-
-static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- BUG_ON(!(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY));
-
- bi->i_sectors = atomic64_read(&ei->i_sectors);
- bi->i_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
- return 0;
-}
-
-static void i_sectors_dirty_put(struct bch_inode_info *ei,
- struct i_sectors_hook *h)
-{
- struct inode *inode = &ei->vfs_inode;
-
- if (h->sectors) {
- spin_lock(&inode->i_lock);
- inode->i_blocks += h->sectors;
- spin_unlock(&inode->i_lock);
-
- atomic64_add(h->sectors, &ei->i_sectors);
- EBUG_ON(atomic64_read(&ei->i_sectors) < 0);
- }
-
- EBUG_ON(atomic_long_read(&ei->i_sectors_dirty_count) <= 0);
-
- mutex_lock(&ei->update_lock);
-
- if (atomic_long_dec_and_test(&ei->i_sectors_dirty_count)) {
- struct bch_fs *c = ei->vfs_inode.i_sb->s_fs_info;
- int ret = __bch_write_inode(c, ei, inode_clear_i_sectors_dirty, NULL);
-
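-		/*
-		 * Error can be ignored here: if the inode update fails, the
-		 * I_SECTORS_DIRTY flag stays set on disk and fsck will recount
-		 * i_sectors:
-		 */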
- ret = ret;
- }
-
- mutex_unlock(&ei->update_lock);
-}
-
-static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei,
- struct i_sectors_hook *h)
-{
- int ret = 0;
-
- h->hook.fn = i_sectors_hook_fn;
- h->sectors = 0;
-#ifdef CONFIG_BCACHE_DEBUG
- h->ei = ei;
-#endif
-
- if (atomic_long_inc_not_zero(&ei->i_sectors_dirty_count))
- return 0;
-
- mutex_lock(&ei->update_lock);
-
- if (!(ei->i_flags & BCH_INODE_I_SECTORS_DIRTY)) {
- struct bch_fs *c = ei->vfs_inode.i_sb->s_fs_info;
-
- ret = __bch_write_inode(c, ei, inode_set_i_sectors_dirty, NULL);
- }
-
- if (!ret)
- atomic_long_inc(&ei->i_sectors_dirty_count);
-
- mutex_unlock(&ei->update_lock);
-
- return ret;
-}
-
-struct bchfs_extent_trans_hook {
- struct bchfs_write_op *op;
- struct extent_insert_hook hook;
-
- struct bch_inode_unpacked inode_u;
- struct bkey_inode_buf inode_p;
-
- bool need_inode_update;
-};
-
-static enum extent_insert_hook_ret
-bchfs_extent_update_hook(struct extent_insert_hook *hook,
- struct bpos committed_pos,
- struct bpos next_pos,
- struct bkey_s_c k,
- const struct bkey_i *insert)
-{
- struct bchfs_extent_trans_hook *h = container_of(hook,
- struct bchfs_extent_trans_hook, hook);
- struct bch_inode_info *ei = h->op->ei;
- struct inode *inode = &ei->vfs_inode;
- int sign = bkey_extent_is_allocation(&insert->k) -
- (k.k && bkey_extent_is_allocation(k.k));
- s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
- u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
- bool do_pack = false;
-
- BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
-
- /* XXX: ei->i_size locking */
- if (offset > ei->i_size) {
- BUG_ON(ei->i_flags & BCH_INODE_I_SIZE_DIRTY);
-
- if (!h->need_inode_update) {
- h->need_inode_update = true;
- return BTREE_HOOK_RESTART_TRANS;
- }
-
- h->inode_u.i_size = offset;
- do_pack = true;
-
- ei->i_size = offset;
-
- if (h->op->is_dio)
- i_size_write(inode, offset);
- }
-
- if (sectors) {
- if (!h->need_inode_update) {
- h->need_inode_update = true;
- return BTREE_HOOK_RESTART_TRANS;
- }
-
- h->inode_u.i_sectors += sectors;
- do_pack = true;
-
- atomic64_add(sectors, &ei->i_sectors);
-
- h->op->sectors_added += sectors;
-
- if (h->op->is_dio) {
- spin_lock(&inode->i_lock);
- inode->i_blocks += sectors;
- spin_unlock(&inode->i_lock);
- }
- }
-
- if (do_pack)
- bch_inode_pack(&h->inode_p, &h->inode_u);
-
- return BTREE_HOOK_DO_INSERT;
-}
-
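-/*
- * Index update path for writes from the VFS: insert each completed extent,
- * and when i_size or i_sectors needs to change, update the inode key in the
- * same btree transaction via bchfs_extent_update_hook:
- */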
-static int bchfs_write_index_update(struct bch_write_op *wop)
-{
- struct bchfs_write_op *op = container_of(wop,
- struct bchfs_write_op, op);
- struct keylist *keys = &op->op.insert_keys;
- struct btree_iter extent_iter, inode_iter;
- struct bchfs_extent_trans_hook hook;
- struct bkey_i *k = bch_keylist_front(keys);
- int ret;
-
- BUG_ON(k->k.p.inode != op->ei->vfs_inode.i_ino);
-
- bch_btree_iter_init_intent(&extent_iter, wop->c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch_keylist_front(keys)->k));
- bch_btree_iter_init_intent(&inode_iter, wop->c, BTREE_ID_INODES,
- POS(extent_iter.pos.inode, 0));
-
- hook.op = op;
- hook.hook.fn = bchfs_extent_update_hook;
- hook.need_inode_update = false;
-
- do {
- ret = bch_btree_iter_traverse(&extent_iter);
- if (ret)
- goto err;
-
- /* XXX: ei->i_size locking */
- k = bch_keylist_front(keys);
- if (min(k->k.p.offset << 9, op->new_i_size) > op->ei->i_size)
- hook.need_inode_update = true;
-
- if (hook.need_inode_update) {
- struct bkey_s_c inode;
-
- if (!btree_iter_linked(&inode_iter))
- bch_btree_iter_link(&extent_iter, &inode_iter);
-
- inode = bch_btree_iter_peek_with_holes(&inode_iter);
- if ((ret = btree_iter_err(inode)))
- goto err;
-
- if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
- "inode %llu not found when updating",
- extent_iter.pos.inode)) {
- ret = -ENOENT;
- break;
- }
-
- if (WARN_ONCE(bkey_bytes(inode.k) >
- sizeof(hook.inode_p),
- "inode %llu too big (%zu bytes, buf %zu)",
- extent_iter.pos.inode,
- bkey_bytes(inode.k),
- sizeof(hook.inode_p))) {
- ret = -ENOENT;
- break;
- }
-
- bkey_reassemble(&hook.inode_p.inode.k_i, inode);
- ret = bch_inode_unpack(bkey_s_c_to_inode(inode),
- &hook.inode_u);
- if (WARN_ONCE(ret,
- "error %i unpacking inode %llu",
- ret, extent_iter.pos.inode)) {
- ret = -ENOENT;
- break;
- }
-
- ret = bch_btree_insert_at(wop->c, &wop->res,
- &hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&extent_iter, k),
- BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
- &hook.inode_p.inode.k_i, 2));
- } else {
- ret = bch_btree_insert_at(wop->c, &wop->res,
- &hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&extent_iter, k));
- }
-err:
- if (ret == -EINTR)
- continue;
- if (ret)
- break;
-
- bch_keylist_pop_front(keys);
- } while (!bch_keylist_empty(keys));
-
- bch_btree_iter_unlock(&extent_iter);
- bch_btree_iter_unlock(&inode_iter);
-
- return ret;
-}
-
-/* page state: */
-
-/* stored in page->private: */
-
-/*
- * bch_page_state has to (unfortunately) be manipulated with cmpxchg - we could
- * almost have protected it with the page lock, except that bch_writepage_io_done
- * has to update the sector counts (and from interrupt/bottom half context).
- */
-struct bch_page_state {
-union { struct {
- /*
- * page is _fully_ written on disk, and not compressed - which means to
- * write this page we don't have to reserve space (the new write will
- * never take up more space on disk than what it's overwriting)
- */
- unsigned allocated:1;
-
- /* Owns PAGE_SECTORS sized reservation: */
- unsigned reserved:1;
- unsigned nr_replicas:4;
-
- /*
- * Number of sectors on disk - for i_blocks
- * Uncompressed size, not compressed size:
- */
- u8 sectors;
- u8 dirty_sectors;
-};
- /* for cmpxchg: */
- unsigned long v;
-};
-};
-
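-/*
- * Atomically update a page's state: _expr modifies _new (and may return from
- * the enclosing function to bail out early); the loop retries until the
- * cmpxchg succeeds, and the previous state is returned:
- */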
-#define page_state_cmpxchg(_ptr, _new, _expr) \
-({ \
- unsigned long _v = READ_ONCE((_ptr)->v); \
- struct bch_page_state _old; \
- \
- do { \
- _old.v = _new.v = _v; \
- _expr; \
- \
- EBUG_ON(_new.sectors + _new.dirty_sectors > PAGE_SECTORS);\
- } while (_old.v != _new.v && \
- (_v = cmpxchg(&(_ptr)->v, _old.v, _new.v)) != _old.v); \
- \
- _old; \
-})
-
-static inline struct bch_page_state *page_state(struct page *page)
-{
- struct bch_page_state *s = (void *) &page->private;
-
- BUILD_BUG_ON(sizeof(*s) > sizeof(page->private));
-
- if (!PagePrivate(page))
- SetPagePrivate(page);
-
- return s;
-}
-
-static void bch_put_page_reservation(struct bch_fs *c, struct page *page)
-{
- struct disk_reservation res = { .sectors = PAGE_SECTORS };
- struct bch_page_state s;
-
- s = page_state_cmpxchg(page_state(page), s, {
- if (!s.reserved)
- return;
- s.reserved = 0;
- });
-
- bch_disk_reservation_put(c, &res);
-}
-
-static int bch_get_page_reservation(struct bch_fs *c, struct page *page,
- bool check_enospc)
-{
- struct bch_page_state *s = page_state(page), new;
- struct disk_reservation res;
- int ret = 0;
-
- BUG_ON(s->allocated && s->sectors != PAGE_SECTORS);
-
- if (s->allocated || s->reserved)
- return 0;
-
- ret = bch_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- return ret;
-
- page_state_cmpxchg(s, new, {
- if (new.reserved) {
- bch_disk_reservation_put(c, &res);
- return 0;
- }
- new.reserved = 1;
- new.nr_replicas = res.nr_replicas;
- });
-
- return 0;
-}
-
-static void bch_clear_page_bits(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct disk_reservation res = { .sectors = PAGE_SECTORS };
- struct bch_page_state s;
-
- if (!PagePrivate(page))
- return;
-
- s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
- ClearPagePrivate(page);
-
- if (s.dirty_sectors) {
- spin_lock(&inode->i_lock);
- inode->i_blocks -= s.dirty_sectors;
- spin_unlock(&inode->i_lock);
- }
-
- if (s.reserved)
- bch_disk_reservation_put(c, &res);
-}
-
-int bch_set_page_dirty(struct page *page)
-{
- struct bch_page_state old, new;
-
- old = page_state_cmpxchg(page_state(page), new,
- new.dirty_sectors = PAGE_SECTORS - new.sectors;
- );
-
- if (old.dirty_sectors != new.dirty_sectors) {
- struct inode *inode = page->mapping->host;
-
- spin_lock(&inode->i_lock);
- inode->i_blocks += new.dirty_sectors - old.dirty_sectors;
- spin_unlock(&inode->i_lock);
- }
-
- return __set_page_dirty_nobuffers(page);
-}
-
-/* readpages/writepages: */
-
-static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
-{
- sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
-
- return bio->bi_vcnt < bio->bi_max_vecs &&
- bio_end_sector(bio) == offset;
-}
-
-static int bio_add_page_contig(struct bio *bio, struct page *page)
-{
- sector_t offset = (sector_t) page->index << (PAGE_SHIFT - 9);
-
- BUG_ON(!bio->bi_max_vecs);
-
- if (!bio->bi_vcnt)
- bio->bi_iter.bi_sector = offset;
- else if (!bio_can_add_page_contig(bio, page))
- return -1;
-
- bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0,
- };
-
- bio->bi_iter.bi_size += PAGE_SIZE;
-
- return 0;
-}
-
-static void bch_readpages_end_io(struct bio *bio)
-{
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
-
- if (!bio->bi_error) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- }
-
- bio_put(bio);
-}
-
-static inline struct page *__readpage_next_page(struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
-{
- struct page *page;
- int ret;
-
- while (*nr_pages) {
- page = list_entry(pages->prev, struct page, lru);
- prefetchw(&page->flags);
- list_del(&page->lru);
-
- ret = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS);
-
- /* if add_to_page_cache_lru() succeeded, page is locked: */
- put_page(page);
-
- if (!ret)
- return page;
-
- (*nr_pages)--;
- }
-
- return NULL;
-}
-
-#define for_each_readpage_page(_mapping, _pages, _nr_pages, _page) \
- for (; \
- ((_page) = __readpage_next_page(_mapping, _pages, &(_nr_pages)));\
- (_nr_pages)--)
-
-static void bch_mark_pages_unalloc(struct bio *bio)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
-
- bio_for_each_segment(bv, bio, iter)
- page_state(bv.bv_page)->allocated = 0;
-}
-
-static void bch_add_page_sectors(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
-
- bio_for_each_segment(bv, bio, iter) {
- struct bch_page_state *s = page_state(bv.bv_page);
-
- /* sectors in @k from the start of this page: */
- unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset);
-
- unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
-
- if (!s->sectors)
- s->nr_replicas = bch_extent_nr_dirty_ptrs(k);
- else
- s->nr_replicas = min_t(unsigned, s->nr_replicas,
- bch_extent_nr_dirty_ptrs(k));
-
- BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
- s->sectors += page_sectors;
- }
-}
-
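-/*
- * Read path shared by readpage/readpages: walk the extents btree starting at
- * the bio's sector, updating per-page allocation state as we go, and issue a
- * read for each extent (or zero fill holes) until the bio is consumed:
- */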
-static void bchfs_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
-{
- struct bio *bio = &rbio->bio;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bio_vec *bv;
- unsigned i;
- int ret;
-
- bch_increment_clock(c, bio_sectors(bio), READ);
-
- /*
- * Initialize page state:
- * If a page is partly allocated and partly a hole, we want it to be
- * marked BCH_PAGE_UNALLOCATED - so we initially mark all pages
- * allocated and then mark them unallocated as we find holes:
- *
- * Note that the bio hasn't been split yet - it's the only bio that
-	 * points to these pages. As we walk extents and split @bio, that won't
-	 * necessarily remain true - the splits won't necessarily be on page
-	 * boundaries:
- */
- bio_for_each_segment_all(bv, bio, i) {
- struct bch_page_state *s = page_state(bv->bv_page);
-
- EBUG_ON(s->reserved);
-
- s->allocated = 1;
- s->sectors = 0;
- }
-
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bio->bi_iter.bi_sector), k) {
- BKEY_PADDED(k) tmp;
- struct extent_pick_ptr pick;
- unsigned bytes, sectors;
- bool is_last;
-
- bkey_reassemble(&tmp.k, k);
- bch_btree_iter_unlock(&iter);
- k = bkey_i_to_s_c(&tmp.k);
-
- if (!bkey_extent_is_allocation(k.k) ||
- bkey_extent_is_compressed(k))
- bch_mark_pages_unalloc(bio);
-
- bch_extent_pick_ptr(c, k, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, bio, "no device to read from");
- bio_endio(bio);
- return;
- }
-
- sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
- bio->bi_iter.bi_sector;
- bytes = sectors << 9;
- is_last = bytes == bio->bi_iter.bi_size;
- swap(bio->bi_iter.bi_size, bytes);
-
- if (bkey_extent_is_allocation(k.k))
- bch_add_page_sectors(bio, k);
-
- if (pick.ca) {
- PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
- c->prio_clock[READ].hand;
-
- bch_read_extent(c, rbio, k, &pick,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_PROMOTE|
- (is_last ? BCH_READ_IS_LAST : 0));
- } else {
- zero_fill_bio_iter(bio, bio->bi_iter);
-
- if (is_last)
- bio_endio(bio);
- }
-
- if (is_last)
- return;
-
- swap(bio->bi_iter.bi_size, bytes);
- bio_advance(bio, bytes);
- }
-
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- ret = bch_btree_iter_unlock(&iter);
- BUG_ON(!ret);
- bcache_io_error(c, bio, "btree IO error %i", ret);
- bio_endio(bio);
-}
-
-int bch_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- struct inode *inode = mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct bch_read_bio *rbio = NULL;
- struct page *page;
-
- pr_debug("reading %u pages", nr_pages);
-
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
-
- for_each_readpage_page(mapping, pages, nr_pages, page) {
-again:
- if (!rbio) {
- rbio = container_of(bio_alloc_bioset(GFP_NOFS,
- min_t(unsigned, nr_pages,
- BIO_MAX_PAGES),
- &c->bio_read),
- struct bch_read_bio, bio);
-
- rbio->bio.bi_end_io = bch_readpages_end_io;
- }
-
- if (bio_add_page_contig(&rbio->bio, page)) {
- bchfs_read(c, rbio, inode->i_ino);
- rbio = NULL;
- goto again;
- }
- }
-
- if (rbio)
- bchfs_read(c, rbio, inode->i_ino);
-
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
-
- pr_debug("success");
- return 0;
-}
-
-int bch_readpage(struct file *file, struct page *page)
-{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct bch_read_bio *rbio;
-
- rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
- &c->bio_read),
- struct bch_read_bio, bio);
- bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
- rbio->bio.bi_end_io = bch_readpages_end_io;
-
- bio_add_page_contig(&rbio->bio, page);
- bchfs_read(c, rbio, inode->i_ino);
-
- return 0;
-}
-
-struct bch_writepage_state {
- struct bch_writepage_io *io;
-};
-
-static void bch_writepage_io_free(struct closure *cl)
-{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
- struct bio *bio = &io->bio.bio;
-
- bio_put(bio);
-}
-
-static void bch_writepage_io_done(struct closure *cl)
-{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
- struct bch_fs *c = io->op.op.c;
- struct bio *bio = &io->bio.bio;
- struct bio_vec *bvec;
- unsigned i;
-
- atomic_sub(bio->bi_vcnt, &c->writeback_pages);
- wake_up(&c->writeback_wait);
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (io->op.op.error) {
- SetPageError(page);
- if (page->mapping)
- set_bit(AS_EIO, &page->mapping->flags);
- }
-
- if (io->op.op.written >= PAGE_SECTORS) {
- struct bch_page_state old, new;
-
- old = page_state_cmpxchg(page_state(page), new, {
- new.sectors = PAGE_SECTORS;
- new.dirty_sectors = 0;
- });
-
- io->op.sectors_added -= old.dirty_sectors;
- io->op.op.written -= PAGE_SECTORS;
- }
- }
-
- /*
- * racing with fallocate can cause us to add fewer sectors than
- * expected - but we shouldn't add more sectors than expected:
- *
- * (error (due to going RO) halfway through a page can screw that up
- * slightly)
- */
- BUG_ON(io->op.sectors_added >= (s64) PAGE_SECTORS);
-
- /*
- * PageWriteback is effectively our ref on the inode - fixup i_blocks
- * before calling end_page_writeback:
- */
- if (io->op.sectors_added) {
- struct inode *inode = &io->op.ei->vfs_inode;
-
- spin_lock(&inode->i_lock);
- inode->i_blocks += io->op.sectors_added;
- spin_unlock(&inode->i_lock);
- }
-
- bio_for_each_segment_all(bvec, bio, i)
- end_page_writeback(bvec->bv_page);
-
- closure_return_with_destructor(&io->cl, bch_writepage_io_free);
-}
-
-static void bch_writepage_do_io(struct bch_writepage_state *w)
-{
- struct bch_writepage_io *io = w->io;
-
- w->io = NULL;
- atomic_add(io->bio.bio.bi_vcnt, &io->op.op.c->writeback_pages);
-
- io->op.op.pos.offset = io->bio.bio.bi_iter.bi_sector;
-
- closure_call(&io->op.op.cl, bch_write, NULL, &io->cl);
- continue_at(&io->cl, bch_writepage_io_done, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch_writepage_io_alloc(struct bch_fs *c,
- struct bch_writepage_state *w,
- struct bch_inode_info *ei,
- struct page *page)
-{
- u64 inum = ei->vfs_inode.i_ino;
- unsigned nr_replicas = page_state(page)->nr_replicas;
-
- EBUG_ON(!nr_replicas);
- /* XXX: disk_reservation->gen isn't plumbed through */
-
- if (!w->io) {
-alloc_io:
- w->io = container_of(bio_alloc_bioset(GFP_NOFS,
- BIO_MAX_PAGES,
- bch_writepage_bioset),
- struct bch_writepage_io, bio.bio);
-
- closure_init(&w->io->cl, NULL);
- w->io->op.ei = ei;
- w->io->op.sectors_added = 0;
- w->io->op.is_dio = false;
- bch_write_op_init(&w->io->op.op, c, &w->io->bio,
- (struct disk_reservation) {
- .nr_replicas = c->opts.data_replicas,
- },
- foreground_write_point(c, inum),
- POS(inum, 0),
- &ei->journal_seq, 0);
- w->io->op.op.index_update_fn = bchfs_write_index_update;
- }
-
- if (w->io->op.op.res.nr_replicas != nr_replicas ||
- bio_add_page_contig(&w->io->bio.bio, page)) {
- bch_writepage_do_io(w);
- goto alloc_io;
- }
-
- /*
- * We shouldn't ever be handed pages for multiple inodes in a single
- * pass - right?
- */
- BUG_ON(ei != w->io->op.ei);
-}
-
-static int __bch_writepage(struct bch_fs *c, struct page *page,
- struct writeback_control *wbc,
- struct bch_writepage_state *w)
-{
- struct inode *inode = page->mapping->host;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bch_page_state new, old;
- unsigned offset;
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
-
- EBUG_ON(!PageUptodate(page));
-
- /* Is the page fully inside i_size? */
- if (page->index < end_index)
- goto do_io;
-
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_SIZE - 1);
- if (page->index > end_index || !offset) {
- unlock_page(page);
- return 0;
- }
-
- /*
- * The page straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- zero_user_segment(page, offset, PAGE_SIZE);
-do_io:
- bch_writepage_io_alloc(c, w, ei, page);
-
- /* while page is locked: */
- w->io->op.new_i_size = i_size;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->bio.bio.bi_opf |= WRITE_SYNC;
-
- /* Before unlocking the page, transfer reservation to w->io: */
- old = page_state_cmpxchg(page_state(page), new, {
- EBUG_ON(!new.reserved &&
- (new.sectors != PAGE_SECTORS ||
- !new.allocated));
-
- if (new.allocated &&
- w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
- new.allocated = 0;
- else if (!new.reserved)
- goto out;
- new.reserved = 0;
- });
-
- w->io->op.op.res.sectors += PAGE_SECTORS *
- (old.reserved - new.reserved) *
- old.nr_replicas;
-out:
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
-
- return 0;
-}
-
-int bch_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
- struct pagecache_iter iter;
- struct page *page;
- int ret = 0;
- int done = 0;
- pgoff_t uninitialized_var(writeback_index);
- pgoff_t index;
- pgoff_t end; /* Inclusive */
- pgoff_t done_index;
- int cycled;
- int range_whole = 0;
- int tag;
-
- if (wbc->range_cyclic) {
- writeback_index = mapping->writeback_index; /* prev offset */
- index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
- end = -1;
- } else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
- }
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
-retry:
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
-
- done_index = index;
-get_pages:
- for_each_pagecache_tag(&iter, mapping, tag, index, end, page) {
- done_index = page->index;
-
- if (w.io &&
- !bio_can_add_page_contig(&w.io->bio.bio, page))
- bch_writepage_do_io(&w);
-
- if (!w.io &&
- atomic_read(&c->writeback_pages) >=
- c->writeback_pages_max) {
- /* don't sleep with pages pinned: */
- pagecache_iter_release(&iter);
-
- __wait_event(c->writeback_wait,
- atomic_read(&c->writeback_pages) <
- c->writeback_pages_max);
- goto get_pages;
- }
-
- lock_page(page);
-
- /*
- * Page truncated or invalidated. We can freely skip it
- * then, even for data integrity operations: the page
- * has disappeared concurrently, so there could be no
-		 * real expectation of this data integrity operation
-		 * even if there is now a new, dirty page at the same
- * pagecache address.
- */
- if (unlikely(page->mapping != mapping)) {
-continue_unlock:
- unlock_page(page);
- continue;
- }
-
- if (!PageDirty(page)) {
- /* someone wrote it for us */
- goto continue_unlock;
- }
-
- if (PageWriteback(page)) {
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
- else
- goto continue_unlock;
- }
-
- BUG_ON(PageWriteback(page));
- if (!clear_page_dirty_for_io(page))
- goto continue_unlock;
-
- trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- ret = __bch_writepage(c, page, wbc, &w);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
- ret = 0;
- } else {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
- done_index = page->index + 1;
- done = 1;
- break;
- }
- }
-
- /*
- * We stop writing back only if we are not doing
- * integrity sync. In case of integrity sync we have to
- * keep going until we have written all the pages
- * we tagged for writeback prior to entering this loop.
- */
- if (--wbc->nr_to_write <= 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- done = 1;
- break;
- }
- }
- pagecache_iter_release(&iter);
-
- if (w.io)
- bch_writepage_do_io(&w);
-
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = done_index;
-
- return ret;
-}
-
-int bch_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
- int ret;
-
- ret = __bch_writepage(c, page, wbc, &w);
- if (w.io)
- bch_writepage_do_io(&w);
-
- return ret;
-}
-
-static void bch_read_single_page_end_io(struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-static int bch_read_single_page(struct page *page,
- struct address_space *mapping)
-{
- struct inode *inode = mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct bch_read_bio *rbio;
- int ret;
- DECLARE_COMPLETION_ONSTACK(done);
-
- rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
- &c->bio_read),
- struct bch_read_bio, bio);
- bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
- rbio->bio.bi_private = &done;
- rbio->bio.bi_end_io = bch_read_single_page_end_io;
- bio_add_page_contig(&rbio->bio, page);
-
- bchfs_read(c, rbio, inode->i_ino);
- wait_for_completion(&done);
-
- ret = rbio->bio.bi_error;
- bio_put(&rbio->bio);
-
- if (ret < 0)
- return ret;
-
- SetPageUptodate(page);
- return 0;
-}
-
-int bch_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- struct inode *inode = mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- pgoff_t index = pos >> PAGE_SHIFT;
- unsigned offset = pos & (PAGE_SIZE - 1);
- struct page *page;
- int ret = -ENOMEM;
-
- BUG_ON(inode_unhashed(mapping->host));
-
- /* Not strictly necessary - same reason as mkwrite(): */
- pagecache_add_get(&mapping->add_lock);
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- goto err_unlock;
-
- if (PageUptodate(page))
- goto out;
-
- /* If we're writing entire page, don't need to read it in first: */
- if (len == PAGE_SIZE)
- goto out;
-
- if (!offset && pos + len >= inode->i_size) {
- zero_user_segment(page, len, PAGE_SIZE);
- flush_dcache_page(page);
- goto out;
- }
-
- if (index > inode->i_size >> PAGE_SHIFT) {
- zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
- flush_dcache_page(page);
- goto out;
- }
-readpage:
- ret = bch_read_single_page(page, mapping);
- if (ret)
- goto err;
-out:
- ret = bch_get_page_reservation(c, page, true);
- if (ret) {
- if (!PageUptodate(page)) {
- /*
- * If the page hasn't been read in, we won't know if we
- * actually need a reservation - we don't actually need
- * to read here, we just need to check if the page is
- * fully backed by uncompressed data:
- */
- goto readpage;
- }
-
- goto err;
- }
-
- *pagep = page;
- return 0;
-err:
- unlock_page(page);
- put_page(page);
- *pagep = NULL;
-err_unlock:
- pagecache_add_put(&mapping->add_lock);
- return ret;
-}
-
-int bch_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = page->mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
-
- lockdep_assert_held(&inode->i_rwsem);
-
- if (unlikely(copied < len && !PageUptodate(page))) {
- /*
- * The page needs to be read in, but that would destroy
- * our partial write - simplest thing is to just force
- * userspace to redo the write:
- */
- zero_user(page, 0, PAGE_SIZE);
- flush_dcache_page(page);
- copied = 0;
- }
-
- if (pos + copied > inode->i_size)
- i_size_write(inode, pos + copied);
-
- if (copied) {
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (!PageDirty(page))
- set_page_dirty(page);
- } else {
- bch_put_page_reservation(c, page);
- }
-
- unlock_page(page);
- put_page(page);
- pagecache_add_put(&mapping->add_lock);
-
- return copied;
-}
-
-/* O_DIRECT */
-
-static void bch_dio_read_complete(struct closure *cl)
-{
- struct dio_read *dio = container_of(cl, struct dio_read, cl);
-
- dio->req->ki_complete(dio->req, dio->ret, 0);
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
-}
-
-static void bch_direct_IO_read_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
-
- if (bio->bi_error)
- dio->ret = bio->bi_error;
-
- closure_put(&dio->cl);
-}
-
-static void bch_direct_IO_read_split_endio(struct bio *bio)
-{
- bch_direct_IO_read_endio(bio);
- bio_check_pages_dirty(bio); /* transfers ownership */
-}
-
-static int bch_direct_IO_read(struct bch_fs *c, struct kiocb *req,
- struct file *file, struct inode *inode,
- struct iov_iter *iter, loff_t offset)
-{
- struct dio_read *dio;
- struct bio *bio;
- bool sync = is_sync_kiocb(req);
- ssize_t ret;
-
- if ((offset|iter->count) & (block_bytes(c) - 1))
- return -EINVAL;
-
- ret = min_t(loff_t, iter->count,
- max_t(loff_t, 0, i_size_read(inode) - offset));
- iov_iter_truncate(iter, round_up(ret, block_bytes(c)));
-
- if (!ret)
- return ret;
-
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
- bch_dio_read_bioset);
-
- bio->bi_end_io = bch_direct_IO_read_endio;
-
- dio = container_of(bio, struct dio_read, rbio.bio);
- closure_init(&dio->cl, NULL);
-
- /*
- * this is a _really_ horrible hack just to avoid an atomic sub at the
- * end:
- */
- if (!sync) {
- set_closure_fn(&dio->cl, bch_dio_read_complete, NULL);
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER -
- CLOSURE_RUNNING +
- CLOSURE_DESTRUCTOR);
- } else {
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER + 1);
- }
-
- dio->req = req;
- dio->ret = ret;
-
- goto start;
- while (iter->count) {
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
- &c->bio_read);
- bio->bi_end_io = bch_direct_IO_read_split_endio;
-start:
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
- bio->bi_iter.bi_sector = offset >> 9;
- bio->bi_private = dio;
-
- ret = bio_get_user_pages(bio, iter, 1);
- if (ret < 0) {
- /* XXX: fault inject this path */
- bio->bi_error = ret;
- bio_endio(bio);
- break;
- }
-
- offset += bio->bi_iter.bi_size;
- bio_set_pages_dirty(bio);
-
- if (iter->count)
- closure_get(&dio->cl);
-
- bch_read(c, container_of(bio,
- struct bch_read_bio, bio),
- inode->i_ino);
- }
-
- if (sync) {
- closure_sync(&dio->cl);
- closure_debug_destroy(&dio->cl);
- ret = dio->ret;
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
- return ret;
- } else {
- return -EIOCBQUEUED;
- }
-}
-
-static long __bch_dio_write_complete(struct dio_write *dio)
-{
- struct file *file = dio->req->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = file->f_inode;
- long ret = dio->error ?: dio->written;
-
- bch_disk_reservation_put(dio->c, &dio->res);
-
- __pagecache_block_put(&mapping->add_lock);
- inode_dio_end(inode);
-
- if (dio->iovec && dio->iovec != dio->inline_vecs)
- kfree(dio->iovec);
-
- bio_put(&dio->bio.bio);
- return ret;
-}
-
-static void bch_dio_write_complete(struct closure *cl)
-{
- struct dio_write *dio = container_of(cl, struct dio_write, cl);
- struct kiocb *req = dio->req;
-
- req->ki_complete(req, __bch_dio_write_complete(dio), 0);
-}
-
-static void bch_dio_write_done(struct dio_write *dio)
-{
- struct bio_vec *bv;
- int i;
-
- dio->written += dio->iop.op.written << 9;
-
- if (dio->iop.op.error)
- dio->error = dio->iop.op.error;
-
- bio_for_each_segment_all(bv, &dio->bio.bio, i)
- put_page(bv->bv_page);
-
- if (dio->iter.count)
- bio_reset(&dio->bio.bio);
-}
-
-static void bch_do_direct_IO_write(struct dio_write *dio)
-{
- struct file *file = dio->req->ki_filp;
- struct inode *inode = file->f_inode;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bio *bio = &dio->bio.bio;
- unsigned flags = 0;
- int ret;
-
- if ((dio->req->ki_flags & IOCB_DSYNC) &&
- !dio->c->opts.journal_flush_disabled)
- flags |= BCH_WRITE_FLUSH;
-
- bio->bi_iter.bi_sector = (dio->offset + dio->written) >> 9;
-
- ret = bio_get_user_pages(bio, &dio->iter, 0);
- if (ret < 0) {
- /*
- * these didn't get initialized, but bch_dio_write_done() will
- * look at them:
- */
- dio->iop.op.error = 0;
- dio->iop.op.written = 0;
- dio->error = ret;
- return;
- }
-
- dio->iop.ei = ei;
- dio->iop.sectors_added = 0;
- dio->iop.is_dio = true;
- dio->iop.new_i_size = U64_MAX;
- bch_write_op_init(&dio->iop.op, dio->c, &dio->bio,
- dio->res,
- foreground_write_point(dio->c, inode->i_ino),
- POS(inode->i_ino, bio->bi_iter.bi_sector),
- &ei->journal_seq, flags);
- dio->iop.op.index_update_fn = bchfs_write_index_update;
-
- dio->res.sectors -= bio_sectors(bio);
- dio->iop.op.res.sectors = bio_sectors(bio);
-
- task_io_account_write(bio->bi_iter.bi_size);
-
- closure_call(&dio->iop.op.cl, bch_write, NULL, &dio->cl);
-}
-
-static void bch_dio_write_loop_async(struct closure *cl)
-{
- struct dio_write *dio =
- container_of(cl, struct dio_write, cl);
- struct address_space *mapping = dio->req->ki_filp->f_mapping;
-
- bch_dio_write_done(dio);
-
- if (dio->iter.count && !dio->error) {
- use_mm(dio->mm);
- pagecache_block_get(&mapping->add_lock);
-
- bch_do_direct_IO_write(dio);
-
- pagecache_block_put(&mapping->add_lock);
- unuse_mm(dio->mm);
-
- continue_at(&dio->cl, bch_dio_write_loop_async, NULL);
- } else {
-#if 0
- closure_return_with_destructor(cl, bch_dio_write_complete);
-#else
- closure_debug_destroy(cl);
- bch_dio_write_complete(cl);
-#endif
- }
-}
-
-static int bch_direct_IO_write(struct bch_fs *c, struct kiocb *req,
- struct file *file, struct inode *inode,
- struct iov_iter *iter, loff_t offset)
-{
- struct address_space *mapping = file->f_mapping;
- struct dio_write *dio;
- struct bio *bio;
- ssize_t ret;
- bool sync = is_sync_kiocb(req);
-
- lockdep_assert_held(&inode->i_rwsem);
-
- if (unlikely(!iter->count))
- return 0;
-
- if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
- return -EINVAL;
-
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
- bch_dio_write_bioset);
- dio = container_of(bio, struct dio_write, bio.bio);
- dio->req = req;
- dio->c = c;
- dio->written = 0;
- dio->error = 0;
- dio->offset = offset;
- dio->iovec = NULL;
- dio->iter = *iter;
- dio->mm = current->mm;
- closure_init(&dio->cl, NULL);
-
- if (offset + iter->count > inode->i_size)
- sync = true;
-
- /*
- * XXX: we shouldn't return -ENOSPC if we're overwriting existing data -
- * if getting a reservation fails we should check if we are doing an
- * overwrite.
- *
- * Have to then guard against racing with truncate (deleting data that
- * we would have been overwriting)
- */
- ret = bch_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
- if (unlikely(ret)) {
- closure_debug_destroy(&dio->cl);
- bio_put(bio);
- return ret;
- }
-
- inode_dio_begin(inode);
- __pagecache_block_get(&mapping->add_lock);
-
- if (sync) {
- do {
- bch_do_direct_IO_write(dio);
-
- closure_sync(&dio->cl);
- bch_dio_write_done(dio);
- } while (dio->iter.count && !dio->error);
-
- closure_debug_destroy(&dio->cl);
- return __bch_dio_write_complete(dio);
- } else {
- bch_do_direct_IO_write(dio);
-
- if (dio->iter.count && !dio->error) {
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- dio->iovec = kmalloc(dio->iter.nr_segs *
- sizeof(struct iovec),
- GFP_KERNEL);
- if (!dio->iovec)
- dio->error = -ENOMEM;
- } else {
- dio->iovec = dio->inline_vecs;
- }
-
-			/* if the allocation failed, dio->error is already set: */
-			if (dio->iovec) {
-				memcpy(dio->iovec,
-				       dio->iter.iov,
-				       dio->iter.nr_segs * sizeof(struct iovec));
-				dio->iter.iov = dio->iovec;
-			}
- }
-
- continue_at_noreturn(&dio->cl, bch_dio_write_loop_async, NULL);
- return -EIOCBQUEUED;
- }
-}
-
-ssize_t bch_direct_IO(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct inode *inode = file->f_inode;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct blk_plug plug;
- ssize_t ret;
-
- blk_start_plug(&plug);
- ret = ((iov_iter_rw(iter) == WRITE)
- ? bch_direct_IO_write
- : bch_direct_IO_read)(c, req, file, inode, iter, req->ki_pos);
- blk_finish_plug(&plug);
-
- return ret;
-}
-
-static ssize_t
-bch_direct_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_inode;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct address_space *mapping = file->f_mapping;
- loff_t pos = iocb->ki_pos;
- ssize_t ret;
-
- pagecache_block_get(&mapping->add_lock);
-
- /* Write and invalidate pagecache range that we're writing to: */
- ret = write_invalidate_inode_pages_range(file->f_mapping, pos,
- pos + iov_iter_count(iter) - 1);
- if (unlikely(ret))
- goto err;
-
- ret = bch_direct_IO_write(c, iocb, file, inode, iter, pos);
-err:
- pagecache_block_put(&mapping->add_lock);
-
- return ret;
-}
-
-static ssize_t __bch_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(inode);
- ret = file_remove_privs(file);
- if (ret)
- goto out;
-
- ret = file_update_time(file);
- if (ret)
- goto out;
-
- ret = iocb->ki_flags & IOCB_DIRECT
- ? bch_direct_write(iocb, from)
- : generic_perform_write(file, from, iocb->ki_pos);
-
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-out:
- current->backing_dev_info = NULL;
- return ret;
-}
-
-ssize_t bch_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- bool direct = iocb->ki_flags & IOCB_DIRECT;
- ssize_t ret;
-
- inode_lock(inode);
- ret = generic_write_checks(iocb, from);
- if (ret > 0)
- ret = __bch_write_iter(iocb, from);
- inode_unlock(inode);
-
- if (ret > 0 && !direct)
- ret = generic_write_sync(iocb, ret);
-
- return ret;
-}
-
-int bch_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page *page = vmf->page;
- struct inode *inode = file_inode(vma->vm_file);
- struct address_space *mapping = inode->i_mapping;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- int ret = VM_FAULT_LOCKED;
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
-
- /*
- * Not strictly necessary, but helps avoid dio writes livelocking in
- * write_invalidate_inode_pages_range() - can drop this if/when we get
- * a write_invalidate_inode_pages_range() that works without dropping
- * page lock before invalidating page
- */
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_get(&mapping->add_lock);
-
- lock_page(page);
- if (page->mapping != mapping ||
- page_offset(page) > i_size_read(inode)) {
- unlock_page(page);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- if (bch_get_page_reservation(c, page, true)) {
- unlock_page(page);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
- if (!PageDirty(page))
- set_page_dirty(page);
- wait_for_stable_page(page);
-out:
- if (current->pagecache_lock != &mapping->add_lock)
- pagecache_add_put(&mapping->add_lock);
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
-void bch_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- EBUG_ON(!PageLocked(page));
- EBUG_ON(PageWriteback(page));
-
- if (offset || length < PAGE_SIZE)
- return;
-
- bch_clear_page_bits(page);
-}
-
-int bch_releasepage(struct page *page, gfp_t gfp_mask)
-{
- EBUG_ON(!PageLocked(page));
- EBUG_ON(PageWriteback(page));
-
- if (PageDirty(page))
- return 0;
-
- bch_clear_page_bits(page);
- return 1;
-}
-
-#ifdef CONFIG_MIGRATION
-int bch_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- int ret;
-
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
- if (ret != MIGRATEPAGE_SUCCESS)
- return ret;
-
- if (PagePrivate(page)) {
- *page_state(newpage) = *page_state(page);
- ClearPagePrivate(page);
- }
-
- migrate_page_copy(newpage, page);
- return MIGRATEPAGE_SUCCESS;
-}
-#endif
-
-int bch_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct inode *inode = file->f_mapping->host;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bch_fs *c = inode->i_sb->s_fs_info;
- int ret;
-
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
-
- if (c->opts.journal_flush_disabled)
- return 0;
-
- return bch_journal_flush_seq(&c->journal, ei->journal_seq);
-}
-
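-/*
- * Zero out the part of a partially truncated page that falls inside
- * [start, end). If the page isn't already cached, it's only created and read
- * in when the extents btree says there's actually data there; otherwise
- * there's nothing to zero:
- */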
-static int __bch_truncate_page(struct address_space *mapping,
- pgoff_t index, loff_t start, loff_t end)
-{
- struct inode *inode = mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- unsigned start_offset = start & (PAGE_SIZE - 1);
- unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
- struct page *page;
- int ret = 0;
-
- /* Page boundary? Nothing to do */
- if (!((index == start >> PAGE_SHIFT && start_offset) ||
- (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
- return 0;
-
- /* Above i_size? */
- if (index << PAGE_SHIFT >= inode->i_size)
- return 0;
-
- page = find_lock_page(mapping, index);
- if (!page) {
- struct btree_iter iter;
- struct bkey_s_c k = bkey_s_c_null;
-
- /*
- * XXX: we're doing two index lookups when we end up reading the
- * page
- */
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino,
- index << (PAGE_SHIFT - 9)), k) {
- if (bkey_cmp(bkey_start_pos(k.k),
- POS(inode->i_ino,
- (index + 1) << (PAGE_SHIFT - 9))) >= 0)
- break;
-
- if (k.k->type != KEY_TYPE_DISCARD &&
- k.k->type != BCH_RESERVATION) {
- bch_btree_iter_unlock(&iter);
- goto create;
- }
- }
- bch_btree_iter_unlock(&iter);
- return 0;
-create:
- page = find_or_create_page(mapping, index, GFP_KERNEL);
- if (unlikely(!page)) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- if (!PageUptodate(page)) {
- ret = bch_read_single_page(page, mapping);
- if (ret)
- goto unlock;
- }
-
- /*
- * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
- *
- * XXX: because we aren't currently tracking whether the page has actual
-	 * data in it (vs. just 0s, or only partially written) this is wrong. ick.
- */
- ret = bch_get_page_reservation(c, page, false);
- BUG_ON(ret);
-
- if (index == start >> PAGE_SHIFT &&
- index == end >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, end_offset);
- else if (index == start >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, PAGE_SIZE);
- else if (index == end >> PAGE_SHIFT)
- zero_user_segment(page, 0, end_offset);
-
- if (!PageDirty(page))
- set_page_dirty(page);
-unlock:
- unlock_page(page);
- put_page(page);
-out:
- return ret;
-}
-
-static int bch_truncate_page(struct address_space *mapping, loff_t from)
-{
- return __bch_truncate_page(mapping, from >> PAGE_SHIFT,
- from, from + PAGE_SIZE);
-}
-
-int bch_truncate(struct inode *inode, struct iattr *iattr)
-{
- struct address_space *mapping = inode->i_mapping;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bch_fs *c = inode->i_sb->s_fs_info;
- bool shrink = iattr->ia_size <= inode->i_size;
- int ret = 0;
-
- inode_dio_wait(inode);
- pagecache_block_get(&mapping->add_lock);
-
- truncate_setsize(inode, iattr->ia_size);
-
- /* sync appends.. */
- /* XXX what protects ei->i_size? */
- if (iattr->ia_size > ei->i_size)
- ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX);
- if (ret)
- goto err_put_pagecache;
-
- mutex_lock(&ei->update_lock);
- i_size_dirty_get(ei);
- ret = bch_write_inode_size(c, ei, inode->i_size);
- mutex_unlock(&ei->update_lock);
-
- if (unlikely(ret))
- goto err;
-
- /*
- * There might be persistent reservations (from fallocate())
- * above i_size, which bch_inode_truncate() will discard - we're
- * only supposed to discard them if we're doing a real truncate
- * here (new i_size < current i_size):
- */
- if (shrink) {
- struct i_sectors_hook i_sectors_hook;
- int ret;
-
- ret = i_sectors_dirty_get(ei, &i_sectors_hook);
- if (unlikely(ret))
- goto err;
-
- ret = bch_truncate_page(inode->i_mapping, iattr->ia_size);
- if (unlikely(ret)) {
- i_sectors_dirty_put(ei, &i_sectors_hook);
- goto err;
- }
-
- ret = bch_inode_truncate(c, inode->i_ino,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- &i_sectors_hook.hook,
- &ei->journal_seq);
-
- i_sectors_dirty_put(ei, &i_sectors_hook);
-
- if (unlikely(ret))
- goto err;
- }
-
- mutex_lock(&ei->update_lock);
- setattr_copy(inode, iattr);
- inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
-
- /* clear I_SIZE_DIRTY: */
- i_size_dirty_put(ei);
- ret = bch_write_inode_size(c, ei, inode->i_size);
- mutex_unlock(&ei->update_lock);
-
- pagecache_block_put(&mapping->add_lock);
-
- return 0;
-err:
- i_size_dirty_put(ei);
-err_put_pagecache:
- pagecache_block_put(&mapping->add_lock);
- return ret;
-}
-
-static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len)
-{
- struct address_space *mapping = inode->i_mapping;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bch_fs *c = inode->i_sb->s_fs_info;
- u64 ino = inode->i_ino;
- u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
- u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
- int ret = 0;
-
- inode_lock(inode);
- inode_dio_wait(inode);
- pagecache_block_get(&mapping->add_lock);
-
- ret = __bch_truncate_page(inode->i_mapping,
- offset >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
- goto out;
-
- if (offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT) {
- ret = __bch_truncate_page(inode->i_mapping,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
- goto out;
- }
-
- truncate_pagecache_range(inode, offset, offset + len - 1);
-
- if (discard_start < discard_end) {
- struct disk_reservation disk_res;
- struct i_sectors_hook i_sectors_hook;
- int ret;
-
- BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0));
-
- ret = i_sectors_dirty_get(ei, &i_sectors_hook);
- if (unlikely(ret))
- goto out;
-
- ret = bch_discard(c,
- POS(ino, discard_start),
- POS(ino, discard_end),
- ZERO_VERSION,
- &disk_res,
- &i_sectors_hook.hook,
- &ei->journal_seq);
-
- i_sectors_dirty_put(ei, &i_sectors_hook);
- bch_disk_reservation_put(c, &disk_res);
- }
-out:
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(inode);
-
- return ret;
-}
-
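-/*
- * FALLOC_FL_COLLAPSE_RANGE: using a pair of linked btree iterators, copy each
- * extent from [offset + len, i_size) down by @len, then truncate off the tail
- * and shrink i_size by @len:
- */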
-static long bch_fcollapse(struct inode *inode, loff_t offset, loff_t len)
-{
- struct address_space *mapping = inode->i_mapping;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct btree_iter src;
- struct btree_iter dst;
- BKEY_PADDED(k) copy;
- struct bkey_s_c k;
- struct i_sectors_hook i_sectors_hook;
- loff_t new_size;
- int ret;
-
- if ((offset | len) & (PAGE_SIZE - 1))
- return -EINVAL;
-
- bch_btree_iter_init_intent(&dst, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, offset >> 9));
- /* position will be set from dst iter's position: */
- bch_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN);
- bch_btree_iter_link(&src, &dst);
-
- /*
- * We need i_mutex to keep the page cache consistent with the extents
- * btree, and the btree consistent with i_size - we don't need outside
- * locking for the extents btree itself, because we're using linked
- * iterators
- */
- inode_lock(inode);
- inode_dio_wait(inode);
- pagecache_block_get(&mapping->add_lock);
-
- ret = -EINVAL;
- if (offset + len >= inode->i_size)
- goto err;
-
- if (inode->i_size < len)
- goto err;
-
- new_size = inode->i_size - len;
-
- ret = write_invalidate_inode_pages_range(inode->i_mapping,
- offset, LLONG_MAX);
- if (ret)
- goto err;
-
- ret = i_sectors_dirty_get(ei, &i_sectors_hook);
- if (ret)
- goto err;
-
- while (bkey_cmp(dst.pos,
- POS(inode->i_ino,
- round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
- struct disk_reservation disk_res;
-
- bch_btree_iter_set_pos(&src,
- POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
-
- ret = bch_btree_iter_traverse(&dst);
- if (ret)
- goto btree_iter_err;
-
- k = bch_btree_iter_peek_with_holes(&src);
- if ((ret = btree_iter_err(k)))
- goto btree_iter_err;
-
- bkey_reassemble(&copy.k, k);
-
- if (bkey_deleted(&copy.k.k))
- copy.k.k.type = KEY_TYPE_DISCARD;
-
- bch_cut_front(src.pos, &copy.k);
- copy.k.k.p.offset -= len >> 9;
-
- BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(&copy.k.k)));
-
- ret = bch_disk_reservation_get(c, &disk_res, copy.k.k.size,
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
-
- ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
- &ei->journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&dst, &copy.k));
- bch_disk_reservation_put(c, &disk_res);
-btree_iter_err:
- if (ret < 0 && ret != -EINTR)
- goto err_unwind;
-
- bch_btree_iter_cond_resched(&src);
- }
-
- bch_btree_iter_unlock(&src);
- bch_btree_iter_unlock(&dst);
-
- ret = bch_inode_truncate(c, inode->i_ino,
- round_up(new_size, PAGE_SIZE) >> 9,
- &i_sectors_hook.hook,
- &ei->journal_seq);
- if (ret)
- goto err_unwind;
-
- i_sectors_dirty_put(ei, &i_sectors_hook);
-
- mutex_lock(&ei->update_lock);
- i_size_write(inode, new_size);
- ret = bch_write_inode_size(c, ei, inode->i_size);
- mutex_unlock(&ei->update_lock);
-
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(inode);
-
- return ret;
-err_unwind:
- /*
- * XXX: we've left data with multiple pointers... which isn't a _super_
- * serious problem...
- */
- i_sectors_dirty_put(ei, &i_sectors_hook);
-err:
- bch_btree_iter_unlock(&src);
- bch_btree_iter_unlock(&dst);
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(inode);
- return ret;
-}
-
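-/*
- * FALLOC_FL_ZERO_RANGE and plain fallocate: walk the requested range,
- * replacing holes (and, for zero range, existing data) with BCH_RESERVATION
- * keys so the space is reserved on disk; i_size is extended afterwards unless
- * FALLOC_FL_KEEP_SIZE was given:
- */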
-static long bch_fallocate(struct inode *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct address_space *mapping = inode->i_mapping;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct i_sectors_hook i_sectors_hook;
- struct btree_iter iter;
- struct bpos end;
- loff_t block_start, block_end;
- loff_t new_size = offset + len;
- unsigned sectors;
- unsigned replicas = READ_ONCE(c->opts.data_replicas);
- int ret;
-
- bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
-
- inode_lock(inode);
- inode_dio_wait(inode);
- pagecache_block_get(&mapping->add_lock);
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->i_size) {
- ret = inode_newsize_ok(inode, new_size);
- if (ret)
- goto err;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch_truncate_page(inode->i_mapping,
- offset >> PAGE_SHIFT,
- offset, offset + len);
-
- if (!ret &&
- offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT)
- ret = __bch_truncate_page(inode->i_mapping,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
-
- if (unlikely(ret))
- goto err;
-
- truncate_pagecache_range(inode, offset, offset + len - 1);
-
- block_start = round_up(offset, PAGE_SIZE);
- block_end = round_down(offset + len, PAGE_SIZE);
- } else {
- block_start = round_down(offset, PAGE_SIZE);
- block_end = round_up(offset + len, PAGE_SIZE);
- }
-
- bch_btree_iter_set_pos(&iter, POS(inode->i_ino, block_start >> 9));
- end = POS(inode->i_ino, block_end >> 9);
-
- ret = i_sectors_dirty_get(ei, &i_sectors_hook);
- if (unlikely(ret))
- goto err;
-
- while (bkey_cmp(iter.pos, end) < 0) {
- struct disk_reservation disk_res = { 0 };
- struct bkey_i_reservation reservation;
- struct bkey_s_c k;
-
- k = bch_btree_iter_peek_with_holes(&iter);
- if ((ret = btree_iter_err(k)))
- goto btree_iter_err;
-
- /* already reserved */
- if (k.k->type == BCH_RESERVATION &&
- bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
- bch_btree_iter_advance_pos(&iter);
- continue;
- }
-
- if (bkey_extent_is_data(k.k)) {
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch_btree_iter_advance_pos(&iter);
- continue;
- }
- }
-
- bkey_reservation_init(&reservation.k_i);
- reservation.k.type = BCH_RESERVATION;
- reservation.k.p = k.k->p;
- reservation.k.size = k.k->size;
-
- bch_cut_front(iter.pos, &reservation.k_i);
- bch_cut_back(end, &reservation.k);
-
- sectors = reservation.k.size;
- reservation.v.nr_replicas = bch_extent_nr_dirty_ptrs(k);
-
- if (reservation.v.nr_replicas < replicas ||
- bkey_extent_is_compressed(k)) {
- ret = bch_disk_reservation_get(c, &disk_res,
- sectors, 0);
- if (ret)
- goto err_put_sectors_dirty;
-
- reservation.v.nr_replicas = disk_res.nr_replicas;
- }
-
- ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
- &ei->journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
- bch_disk_reservation_put(c, &disk_res);
-btree_iter_err:
- if (ret < 0 && ret != -EINTR)
- goto err_put_sectors_dirty;
- }
- bch_btree_iter_unlock(&iter);
-
- i_sectors_dirty_put(ei, &i_sectors_hook);
-
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->i_size) {
- i_size_write(inode, new_size);
-
- mutex_lock(&ei->update_lock);
- ret = bch_write_inode_size(c, ei, inode->i_size);
- mutex_unlock(&ei->update_lock);
- }
-
- /* blech */
- if ((mode & FALLOC_FL_KEEP_SIZE) &&
- (mode & FALLOC_FL_ZERO_RANGE) &&
- ei->i_size != inode->i_size) {
- /* sync appends.. */
- ret = filemap_write_and_wait_range(mapping, ei->i_size, S64_MAX);
- if (ret)
- goto err;
-
- if (ei->i_size != inode->i_size) {
- mutex_lock(&ei->update_lock);
- ret = bch_write_inode_size(c, ei, inode->i_size);
- mutex_unlock(&ei->update_lock);
- }
- }
-
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(inode);
-
- return 0;
-err_put_sectors_dirty:
- i_sectors_dirty_put(ei, &i_sectors_hook);
-err:
- bch_btree_iter_unlock(&iter);
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(inode);
- return ret;
-}
-
-long bch_fallocate_dispatch(struct file *file, int mode,
- loff_t offset, loff_t len)
-{
- struct inode *inode = file_inode(file);
-
- if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
- return bch_fallocate(inode, mode, offset, len);
-
- if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
- return bch_fpunch(inode, offset, len);
-
- if (mode == FALLOC_FL_COLLAPSE_RANGE)
- return bch_fcollapse(inode, offset, len);
-
- return -EOPNOTSUPP;
-}
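
bch_fallocate_dispatch() accepts exactly three mode combinations: ordinary preallocation/zero-range (optionally with FALLOC_FL_KEEP_SIZE), punch-hole (which the VFS only permits together with FALLOC_FL_KEEP_SIZE), and collapse-range. For reference, a minimal userspace caller exercising the punch-hole path could look like the following sketch (illustrative only; the file name and sizes are arbitrary):

/* Illustrative userspace caller for the punch-hole path above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDWR)) < 0) {
		perror("open");
		return 1;
	}

	/* Drop 1MB of data starting at offset 4096, keeping i_size unchanged: */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
		      4096, 1 << 20)) {
		perror("fallocate");
		return 1;
	}

	close(fd);
	return 0;
}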
-
-static bool page_is_data(struct page *page)
-{
- /* XXX: should only have to check PageDirty */
- return PagePrivate(page) &&
- (page_state(page)->sectors ||
- page_state(page)->dirty_sectors);
-}
-
-static loff_t bch_next_pagecache_data(struct inode *inode,
- loff_t start_offset,
- loff_t end_offset)
-{
- struct address_space *mapping = inode->i_mapping;
- struct page *page;
- pgoff_t index;
-
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++) {
- if (find_get_pages(mapping, index, 1, &page)) {
- lock_page(page);
- index = page->index;
-
- if (page_is_data(page))
- end_offset =
- min(end_offset,
- max(start_offset,
- ((loff_t) index) << PAGE_SHIFT));
- unlock_page(page);
- put_page(page);
- } else {
- break;
- }
- }
-
- return end_offset;
-}
-
-static loff_t bch_seek_data(struct file *file, u64 offset)
-{
- struct inode *inode = file->f_mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 isize, next_data = MAX_LFS_FILESIZE;
- int ret;
-
- isize = i_size_read(inode);
- if (offset >= isize)
- return -ENXIO;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, offset >> 9), k) {
- if (k.k->p.inode != inode->i_ino) {
- break;
- } else if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
-		} else if (k.k->p.offset << 9 > isize)
- break;
- }
-
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- return ret;
-
- if (next_data > offset)
- next_data = bch_next_pagecache_data(inode, offset, next_data);
-
- if (next_data > isize)
- return -ENXIO;
-
- return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
-}
-
-static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
-{
- struct page *page;
- bool ret;
-
- page = find_lock_entry(mapping, index);
- if (!page || radix_tree_exception(page))
- return false;
-
- ret = page_is_data(page);
- unlock_page(page);
-
- return ret;
-}
-
-static loff_t bch_next_pagecache_hole(struct inode *inode,
- loff_t start_offset,
- loff_t end_offset)
-{
- struct address_space *mapping = inode->i_mapping;
- pgoff_t index;
-
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++)
- if (!page_slot_is_data(mapping, index))
- end_offset = max(start_offset,
- ((loff_t) index) << PAGE_SHIFT);
-
- return end_offset;
-}
-
-static loff_t bch_seek_hole(struct file *file, u64 offset)
-{
- struct inode *inode = file->f_mapping->host;
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 isize, next_hole = MAX_LFS_FILESIZE;
- int ret;
-
- isize = i_size_read(inode);
- if (offset >= isize)
- return -ENXIO;
-
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, offset >> 9), k) {
- if (k.k->p.inode != inode->i_ino) {
- next_hole = bch_next_pagecache_hole(inode,
- offset, MAX_LFS_FILESIZE);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch_next_pagecache_hole(inode,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9);
-
- if (next_hole < k.k->p.offset << 9)
- break;
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- }
-
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- return ret;
-
- if (next_hole > isize)
- next_hole = isize;
-
- return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
-}
-
-loff_t bch_llseek(struct file *file, loff_t offset, int whence)
-{
- switch (whence) {
- case SEEK_SET:
- case SEEK_CUR:
- case SEEK_END:
- return generic_file_llseek(file, offset, whence);
- case SEEK_DATA:
- return bch_seek_data(file, offset);
- case SEEK_HOLE:
- return bch_seek_hole(file, offset);
- }
-
- return -EINVAL;
-}
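
Both helpers above are reached through plain lseek(2): bch_seek_data() takes the first data extent from the btree and then lets bch_next_pagecache_data() pull the answer earlier if dirty pagecache precedes it, and bch_seek_hole() does the converse for holes. A small userspace sketch that walks a sparse file with SEEK_DATA/SEEK_HOLE (illustrative, minimal error handling):

/* Illustrative: walk the data regions of a sparse file via SEEK_DATA/SEEK_HOLE. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t data = 0, hole;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data: %lld..%lld\n", (long long) data, (long long) hole);
		data = hole;
	}

	close(fd);
	return 0;
}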
diff --git a/libbcache/fs-io.h b/libbcache/fs-io.h
deleted file mode 100644
index 4c428978..00000000
--- a/libbcache/fs-io.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#ifndef _BCACHE_FS_IO_H
-#define _BCACHE_FS_IO_H
-
-#include "buckets.h"
-#include <linux/uio.h>
-
-int bch_set_page_dirty(struct page *);
-
-int bch_writepage(struct page *, struct writeback_control *);
-int bch_readpage(struct file *, struct page *);
-
-int bch_writepages(struct address_space *, struct writeback_control *);
-int bch_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
-
-int bch_write_begin(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page **, void **);
-int bch_write_end(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page *, void *);
-
-ssize_t bch_direct_IO(struct kiocb *, struct iov_iter *);
-
-ssize_t bch_write_iter(struct kiocb *, struct iov_iter *);
-
-int bch_fsync(struct file *, loff_t, loff_t, int);
-
-int bch_truncate(struct inode *, struct iattr *);
-long bch_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-
-loff_t bch_llseek(struct file *, loff_t, int);
-
-int bch_page_mkwrite(struct vm_area_struct *, struct vm_fault *);
-void bch_invalidatepage(struct page *, unsigned int, unsigned int);
-int bch_releasepage(struct page *, gfp_t);
-int bch_migrate_page(struct address_space *, struct page *,
- struct page *, enum migrate_mode);
-
-struct i_sectors_hook {
- struct extent_insert_hook hook;
- s64 sectors;
- struct bch_inode_info *ei;
-};
-
-struct bchfs_write_op {
- struct bch_inode_info *ei;
- s64 sectors_added;
- bool is_dio;
- u64 new_i_size;
- struct bch_write_op op;
-};
-
-struct bch_writepage_io {
- struct closure cl;
-
- struct bchfs_write_op op;
-
- /* must come last: */
- struct bch_write_bio bio;
-};
-
-extern struct bio_set *bch_writepage_bioset;
-
-struct dio_write {
- struct closure cl;
- struct kiocb *req;
- struct bch_fs *c;
- long written;
- long error;
- loff_t offset;
-
- struct disk_reservation res;
-
- struct iovec *iovec;
- struct iovec inline_vecs[UIO_FASTIOV];
- struct iov_iter iter;
-
- struct mm_struct *mm;
-
- struct bchfs_write_op iop;
-
- /* must be last: */
- struct bch_write_bio bio;
-};
-
-extern struct bio_set *bch_dio_write_bioset;
-
-struct dio_read {
- struct closure cl;
- struct kiocb *req;
- long ret;
- struct bch_read_bio rbio;
-};
-
-extern struct bio_set *bch_dio_read_bioset;
-
-#endif /* _BCACHE_FS_IO_H */
diff --git a/libbcache/fs.c b/libbcache/fs.c
deleted file mode 100644
index f1125a32..00000000
--- a/libbcache/fs.c
+++ /dev/null
@@ -1,1481 +0,0 @@
-
-#include "bcache.h"
-#include "acl.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "chardev.h"
-#include "dirent.h"
-#include "extents.h"
-#include "fs.h"
-#include "fs-gc.h"
-#include "fs-io.h"
-#include "inode.h"
-#include "journal.h"
-#include "keylist.h"
-#include "super.h"
-#include "xattr.h"
-
-#include <linux/aio.h>
-#include <linux/backing-dev.h>
-#include <linux/compat.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/random.h>
-#include <linux/statfs.h>
-#include <linux/xattr.h>
-
-static struct kmem_cache *bch_inode_cache;
-
-static void bch_vfs_inode_init(struct bch_fs *,
- struct bch_inode_info *,
- struct bch_inode_unpacked *);
-
-/*
- * I_SIZE_DIRTY requires special handling:
- *
- * To the recovery code, the flag means that there is stale data past i_size
- * that needs to be deleted; it's used for implementing atomic appends and
- * truncates.
- *
- * On append, we set I_SIZE_DIRTY before doing the write, then after the write
- * we clear I_SIZE_DIRTY atomically with updating i_size to the new larger size
- * that exposes the data we just wrote.
- *
- * On truncate, it's the reverse: We set I_SIZE_DIRTY atomically with setting
- * i_size to the new smaller size, then we delete the data that we just made
- * invisible, and then we clear I_SIZE_DIRTY.
- *
- * Because there can be multiple appends in flight at a time, we need a refcount
- * (i_size_dirty_count) instead of manipulating the flag directly. Nonzero
- * refcount means I_SIZE_DIRTY is set, zero means it's cleared.
- *
- * Because write_inode() can be called at any time, i_size_dirty_count means
- * something different to the runtime code - it means to write_inode() "don't
- * update i_size yet".
- *
- * We don't clear I_SIZE_DIRTY directly, we let write_inode() clear it when
- * i_size_dirty_count is zero - but the reverse is not true, I_SIZE_DIRTY must
- * be set explicitly.
- */
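
Concretely, an append brackets the write with the refcount rather than touching the flag itself. A sketch of that ordering follows; this is not the literal implementation (the real append and truncate paths live in fs-io.c), and do_the_write() is a placeholder:

/*
 * Sketch of the append-side ordering described above. i_size_dirty_count is
 * the real field from struct bch_inode_info; do_the_write() is hypothetical.
 */
static int append_sketch(struct bch_fs *c, struct bch_inode_info *ei,
			 loff_t new_i_size)
{
	int ret;

	/* 1) mark i_size dirty - a nonzero count means I_SIZE_DIRTY on disk: */
	atomic_long_inc(&ei->i_size_dirty_count);

	ret = do_the_write(c, ei);

	/* 2) expose the new data: update the in-core i_size first... */
	if (!ret)
		i_size_write(&ei->vfs_inode, new_i_size);

	/*
	 * 3) ...then drop the refcount; the next write_inode() writes the
	 * larger i_size and clears I_SIZE_DIRTY once the count hits zero.
	 */
	atomic_long_dec(&ei->i_size_dirty_count);

	return ret;
}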
-
-int __must_check __bch_write_inode(struct bch_fs *c,
- struct bch_inode_info *ei,
- inode_set_fn set,
- void *p)
-{
- struct btree_iter iter;
- struct inode *inode = &ei->vfs_inode;
- struct bch_inode_unpacked inode_u;
- struct bkey_inode_buf inode_p;
- u64 inum = inode->i_ino;
- unsigned i_nlink = READ_ONCE(inode->i_nlink);
- int ret;
-
- /*
- * We can't write an inode with i_nlink == 0 because it's stored biased;
- * however, we don't need to because if i_nlink is 0 the inode is
- * getting deleted when it's evicted.
- */
- if (!i_nlink)
- return 0;
-
- lockdep_assert_held(&ei->update_lock);
-
- bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0));
-
- do {
- struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
-
- if ((ret = btree_iter_err(k)))
- goto out;
-
- if (WARN_ONCE(k.k->type != BCH_INODE_FS,
- "inode %llu not found when updating", inum)) {
- bch_btree_iter_unlock(&iter);
- return -ENOENT;
- }
-
- ret = bch_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
- if (WARN_ONCE(ret,
- "error %i unpacking inode %llu", ret, inum)) {
- ret = -ENOENT;
- break;
- }
-
- if (set) {
- ret = set(ei, &inode_u, p);
- if (ret)
- goto out;
- }
-
- BUG_ON(i_nlink < nlink_bias(inode->i_mode));
-
- inode_u.i_mode = inode->i_mode;
- inode_u.i_uid = i_uid_read(inode);
- inode_u.i_gid = i_gid_read(inode);
- inode_u.i_nlink = i_nlink - nlink_bias(inode->i_mode);
- inode_u.i_dev = inode->i_rdev;
- inode_u.i_atime = timespec_to_bch_time(c, inode->i_atime);
- inode_u.i_mtime = timespec_to_bch_time(c, inode->i_mtime);
- inode_u.i_ctime = timespec_to_bch_time(c, inode->i_ctime);
-
- bch_inode_pack(&inode_p, &inode_u);
-
- ret = bch_btree_insert_at(c, NULL, NULL, &ei->journal_seq,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
- } while (ret == -EINTR);
-
- if (!ret) {
- ei->i_size = inode_u.i_size;
- ei->i_flags = inode_u.i_flags;
- }
-out:
- bch_btree_iter_unlock(&iter);
-
- return ret < 0 ? ret : 0;
-}
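
Because the insert uses BTREE_INSERT_ATOMIC, the whole peek/set/insert sequence is retried on -EINTR, so an inode_set_fn sees the current packed fields on every pass. A hedged example of a setter that touches a single field (the names here are illustrative; bch_inode_user_flags_set() further down is the real in-tree user of this hook):

/* Illustrative inode_set_fn: update one unpacked field under the btree lock. */
static int set_generation_sketch(struct bch_inode_info *ei,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	/* i_generation is one of the BCH_INODE_FIELDS() members (see inode.h) */
	bi->i_generation = *(u32 *) p;
	return 0;
}

static int bump_generation_sketch(struct bch_fs *c,
				  struct bch_inode_info *ei, u32 gen)
{
	int ret;

	mutex_lock(&ei->update_lock);
	ret = __bch_write_inode(c, ei, set_generation_sketch, &gen);
	mutex_unlock(&ei->update_lock);

	return ret;
}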
-
-int __must_check bch_write_inode(struct bch_fs *c,
- struct bch_inode_info *ei)
-{
- return __bch_write_inode(c, ei, NULL, NULL);
-}
-
-int bch_inc_nlink(struct bch_fs *c, struct bch_inode_info *ei)
-{
- int ret;
-
- mutex_lock(&ei->update_lock);
- inc_nlink(&ei->vfs_inode);
- ret = bch_write_inode(c, ei);
- mutex_unlock(&ei->update_lock);
-
- return ret;
-}
-
-int bch_dec_nlink(struct bch_fs *c, struct bch_inode_info *ei)
-{
- int ret = 0;
-
- mutex_lock(&ei->update_lock);
- drop_nlink(&ei->vfs_inode);
- ret = bch_write_inode(c, ei);
- mutex_unlock(&ei->update_lock);
-
- return ret;
-}
-
-static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct inode *inode;
- struct bch_inode_unpacked inode_u;
- struct bch_inode_info *ei;
- int ret;
-
- pr_debug("inum %llu", inum);
-
- inode = iget_locked(sb, inum);
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
-
- ret = bch_inode_find_by_inum(c, inum, &inode_u);
- if (ret) {
- iget_failed(inode);
- return ERR_PTR(ret);
- }
-
- ei = to_bch_ei(inode);
- bch_vfs_inode_init(c, ei, &inode_u);
-
- ei->journal_seq = bch_inode_journal_seq(&c->journal, inum);
-
- unlock_new_inode(inode);
-
- return inode;
-}
-
-static struct inode *bch_vfs_inode_create(struct bch_fs *c,
- struct inode *parent,
- umode_t mode, dev_t rdev)
-{
- struct inode *inode;
- struct posix_acl *default_acl = NULL, *acl = NULL;
- struct bch_inode_info *ei;
- struct bch_inode_unpacked inode_u;
- struct bkey_inode_buf inode_p;
- int ret;
-
- inode = new_inode(parent->i_sb);
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
-
- inode_init_owner(inode, parent, mode);
-
- ret = posix_acl_create(parent, &inode->i_mode, &default_acl, &acl);
- if (ret) {
- make_bad_inode(inode);
- goto err;
- }
-
- ei = to_bch_ei(inode);
-
- bch_inode_init(c, &inode_u, i_uid_read(inode),
- i_gid_read(inode), inode->i_mode, rdev);
- bch_inode_pack(&inode_p, &inode_u);
-
- ret = bch_inode_create(c, &inode_p.inode.k_i,
- BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
- if (unlikely(ret)) {
- /*
- * indicate to bch_evict_inode that the inode was never actually
- * created:
- */
- make_bad_inode(inode);
- goto err;
- }
-
- inode_u.inum = inode_p.inode.k.p.inode;
- bch_vfs_inode_init(c, ei, &inode_u);
-
- if (default_acl) {
- ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
- if (unlikely(ret))
- goto err;
- }
-
- if (acl) {
- ret = bch_set_acl(inode, acl, ACL_TYPE_ACCESS);
- if (unlikely(ret))
- goto err;
- }
-
- insert_inode_hash(inode);
- atomic_long_inc(&c->nr_inodes);
-out:
- posix_acl_release(default_acl);
- posix_acl_release(acl);
- return inode;
-err:
- clear_nlink(inode);
- iput(inode);
- inode = ERR_PTR(ret);
- goto out;
-}
-
-static int bch_vfs_dirent_create(struct bch_fs *c, struct inode *dir,
- u8 type, const struct qstr *name,
- struct inode *dst)
-{
- struct bch_inode_info *dir_ei = to_bch_ei(dir);
- int ret;
-
- ret = bch_dirent_create(c, dir->i_ino, &dir_ei->str_hash,
- type, name, dst->i_ino,
- &dir_ei->journal_seq,
- BCH_HASH_SET_MUST_CREATE);
- if (unlikely(ret))
- return ret;
-
- dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
- mark_inode_dirty_sync(dir);
- return 0;
-}
-
-static int __bch_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- struct bch_inode_info *dir_ei = to_bch_ei(dir);
- struct bch_fs *c = dir->i_sb->s_fs_info;
- struct inode *inode;
- struct bch_inode_info *ei;
- int ret;
-
- inode = bch_vfs_inode_create(c, dir, mode, rdev);
- if (unlikely(IS_ERR(inode)))
- return PTR_ERR(inode);
-
- ei = to_bch_ei(inode);
-
- ret = bch_vfs_dirent_create(c, dir, mode_to_type(mode),
- &dentry->d_name, inode);
- if (unlikely(ret)) {
- clear_nlink(inode);
- iput(inode);
- return ret;
- }
-
- if (dir_ei->journal_seq > ei->journal_seq)
- ei->journal_seq = dir_ei->journal_seq;
-
- d_instantiate(dentry, inode);
- return 0;
-}
-
-/* methods */
-
-static struct dentry *bch_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct bch_fs *c = dir->i_sb->s_fs_info;
- struct bch_inode_info *dir_ei = to_bch_ei(dir);
- struct inode *inode = NULL;
- u64 inum;
-
- inum = bch_dirent_lookup(c, dir->i_ino,
- &dir_ei->str_hash,
- &dentry->d_name);
-
- if (inum)
- inode = bch_vfs_inode_get(dir->i_sb, inum);
-
- return d_splice_alias(inode, dentry);
-}
-
-static int bch_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, bool excl)
-{
- return __bch_create(dir, dentry, mode|S_IFREG, 0);
-}
-
-static int bch_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- struct bch_fs *c = dir->i_sb->s_fs_info;
- struct inode *inode = old_dentry->d_inode;
- struct bch_inode_info *ei = to_bch_ei(inode);
- int ret;
-
- lockdep_assert_held(&inode->i_rwsem);
-
- inode->i_ctime = current_fs_time(dir->i_sb);
-
- ret = bch_inc_nlink(c, ei);
- if (ret)
- return ret;
-
- ihold(inode);
-
- ret = bch_vfs_dirent_create(c, dir, mode_to_type(inode->i_mode),
- &dentry->d_name, inode);
- if (unlikely(ret)) {
- bch_dec_nlink(c, ei);
- iput(inode);
- return ret;
- }
-
- d_instantiate(dentry, inode);
- return 0;
-}
-
-static int bch_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct bch_fs *c = dir->i_sb->s_fs_info;
- struct bch_inode_info *dir_ei = to_bch_ei(dir);
- struct inode *inode = dentry->d_inode;
- struct bch_inode_info *ei = to_bch_ei(inode);
- int ret;
-
- lockdep_assert_held(&inode->i_rwsem);
-
- ret = bch_dirent_delete(c, dir->i_ino, &dir_ei->str_hash,
- &dentry->d_name, &dir_ei->journal_seq);
- if (ret)
- return ret;
-
- if (dir_ei->journal_seq > ei->journal_seq)
- ei->journal_seq = dir_ei->journal_seq;
-
- inode->i_ctime = dir->i_ctime;
-
- if (S_ISDIR(inode->i_mode)) {
- bch_dec_nlink(c, dir_ei);
- drop_nlink(inode);
- }
-
- bch_dec_nlink(c, ei);
-
- return 0;
-}
-
-static int bch_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
-{
- struct bch_fs *c = dir->i_sb->s_fs_info;
- struct inode *inode;
- struct bch_inode_info *ei, *dir_ei = to_bch_ei(dir);
- int ret;
-
- inode = bch_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
- if (unlikely(IS_ERR(inode)))
- return PTR_ERR(inode);
-
- ei = to_bch_ei(inode);
-
- inode_lock(inode);
- ret = page_symlink(inode, symname, strlen(symname) + 1);
- inode_unlock(inode);
-
- if (unlikely(ret))
- goto err;
-
- ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
- if (unlikely(ret))
- goto err;
-
- /* XXX: racy */
- if (dir_ei->journal_seq < ei->journal_seq)
- dir_ei->journal_seq = ei->journal_seq;
-
- ret = bch_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name, inode);
- if (unlikely(ret))
- goto err;
-
- d_instantiate(dentry, inode);
- return 0;
-err:
- clear_nlink(inode);
- iput(inode);
- return ret;
-}
-
-static int bch_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct bch_fs *c = dir->i_sb->s_fs_info;
- int ret;
-
- lockdep_assert_held(&dir->i_rwsem);
-
- ret = __bch_create(dir, dentry, mode|S_IFDIR, 0);
- if (unlikely(ret))
- return ret;
-
- bch_inc_nlink(c, to_bch_ei(dir));
-
- return 0;
-}
-
-static int bch_rmdir(struct inode *dir, struct dentry *dentry)
-{
- struct bch_fs *c = dir->i_sb->s_fs_info;
- struct inode *inode = dentry->d_inode;
-
- if (bch_empty_dir(c, inode->i_ino))
- return -ENOTEMPTY;
-
- return bch_unlink(dir, dentry);
-}
-
-static int bch_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- return __bch_create(dir, dentry, mode, rdev);
-}
-
-static int bch_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- struct bch_fs *c = old_dir->i_sb->s_fs_info;
- struct inode *old_inode = old_dentry->d_inode;
- struct bch_inode_info *ei = to_bch_ei(old_inode);
- struct inode *new_inode = new_dentry->d_inode;
- struct timespec now = current_fs_time(old_dir->i_sb);
- int ret;
-
- lockdep_assert_held(&old_dir->i_rwsem);
- lockdep_assert_held(&new_dir->i_rwsem);
-
- if (new_inode)
- filemap_write_and_wait_range(old_inode->i_mapping,
- 0, LLONG_MAX);
-
- if (new_inode && S_ISDIR(old_inode->i_mode)) {
- lockdep_assert_held(&new_inode->i_rwsem);
-
- if (!S_ISDIR(new_inode->i_mode))
- return -ENOTDIR;
-
- if (bch_empty_dir(c, new_inode->i_ino))
- return -ENOTEMPTY;
-
- ret = bch_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &ei->journal_seq, BCH_RENAME_OVERWRITE);
- if (unlikely(ret))
- return ret;
-
- clear_nlink(new_inode);
- bch_dec_nlink(c, to_bch_ei(old_dir));
- } else if (new_inode) {
- lockdep_assert_held(&new_inode->i_rwsem);
-
- ret = bch_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &ei->journal_seq, BCH_RENAME_OVERWRITE);
- if (unlikely(ret))
- return ret;
-
- new_inode->i_ctime = now;
- bch_dec_nlink(c, to_bch_ei(new_inode));
- } else if (S_ISDIR(old_inode->i_mode)) {
- ret = bch_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &ei->journal_seq, BCH_RENAME);
- if (unlikely(ret))
- return ret;
-
- bch_inc_nlink(c, to_bch_ei(new_dir));
- bch_dec_nlink(c, to_bch_ei(old_dir));
- } else {
- ret = bch_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &ei->journal_seq, BCH_RENAME);
- if (unlikely(ret))
- return ret;
- }
-
- old_dir->i_ctime = old_dir->i_mtime = now;
- new_dir->i_ctime = new_dir->i_mtime = now;
- mark_inode_dirty_sync(old_dir);
- mark_inode_dirty_sync(new_dir);
-
- old_inode->i_ctime = now;
- mark_inode_dirty_sync(old_inode);
-
- return 0;
-}
-
-static int bch_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- struct bch_fs *c = old_dir->i_sb->s_fs_info;
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
- struct bch_inode_info *ei = to_bch_ei(old_inode);
- struct timespec now = current_fs_time(old_dir->i_sb);
- int ret;
-
- ret = bch_dirent_rename(c,
- old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name,
- &ei->journal_seq, BCH_RENAME_EXCHANGE);
- if (unlikely(ret))
- return ret;
-
- if (S_ISDIR(old_inode->i_mode) !=
- S_ISDIR(new_inode->i_mode)) {
- if (S_ISDIR(old_inode->i_mode)) {
- bch_inc_nlink(c, to_bch_ei(new_dir));
- bch_dec_nlink(c, to_bch_ei(old_dir));
- } else {
- bch_dec_nlink(c, to_bch_ei(new_dir));
- bch_inc_nlink(c, to_bch_ei(old_dir));
- }
- }
-
- old_dir->i_ctime = old_dir->i_mtime = now;
- new_dir->i_ctime = new_dir->i_mtime = now;
- mark_inode_dirty_sync(old_dir);
- mark_inode_dirty_sync(new_dir);
-
- old_inode->i_ctime = now;
- new_inode->i_ctime = now;
- mark_inode_dirty_sync(old_inode);
- mark_inode_dirty_sync(new_inode);
-
- return 0;
-}
-
-static int bch_rename2(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned flags)
-{
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
- return -EINVAL;
-
- if (flags & RENAME_EXCHANGE)
- return bch_rename_exchange(old_dir, old_dentry,
- new_dir, new_dentry);
-
- return bch_rename(old_dir, old_dentry, new_dir, new_dentry);
-}
-
-static int bch_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = dentry->d_inode;
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct bch_fs *c = inode->i_sb->s_fs_info;
- int ret = 0;
-
- lockdep_assert_held(&inode->i_rwsem);
-
- pr_debug("i_size was %llu update has %llu",
- inode->i_size, iattr->ia_size);
-
- ret = setattr_prepare(dentry, iattr);
- if (ret)
- return ret;
-
- if (iattr->ia_valid & ATTR_SIZE) {
- ret = bch_truncate(inode, iattr);
- } else {
- mutex_lock(&ei->update_lock);
- setattr_copy(inode, iattr);
- ret = bch_write_inode(c, ei);
- mutex_unlock(&ei->update_lock);
- }
-
- if (unlikely(ret))
- return ret;
-
- if (iattr->ia_valid & ATTR_MODE)
- ret = posix_acl_chmod(inode, inode->i_mode);
-
- return ret;
-}
-
-static int bch_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct bch_fs *c = dir->i_sb->s_fs_info;
- struct inode *inode;
-
- /* XXX: i_nlink should be 0? */
- inode = bch_vfs_inode_create(c, dir, mode, 0);
- if (unlikely(IS_ERR(inode)))
- return PTR_ERR(inode);
-
- d_tmpfile(dentry, inode);
- return 0;
-}
-
-static int bch_fill_extent(struct fiemap_extent_info *info,
- const struct bkey_i *k, unsigned flags)
-{
- if (bkey_extent_is_data(&k->k)) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
- int ret;
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- int flags2 = 0;
- u64 offset = ptr->offset;
-
- if (crc_compression_type(crc))
- flags2 |= FIEMAP_EXTENT_ENCODED;
- else
- offset += crc_offset(crc);
-
- if ((offset & (PAGE_SECTORS - 1)) ||
- (e.k->size & (PAGE_SECTORS - 1)))
- flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
-
- ret = fiemap_fill_next_extent(info,
- bkey_start_offset(e.k) << 9,
- offset << 9,
- e.k->size << 9, flags|flags2);
- if (ret)
- return ret;
- }
-
- return 0;
- } else if (k->k.type == BCH_RESERVATION) {
- return fiemap_fill_next_extent(info,
- bkey_start_offset(&k->k) << 9,
- 0, k->k.size << 9,
- flags|
- FIEMAP_EXTENT_DELALLOC|
- FIEMAP_EXTENT_UNWRITTEN);
- } else {
- BUG();
- }
-}
-
-static int bch_fiemap(struct inode *inode, struct fiemap_extent_info *info,
- u64 start, u64 len)
-{
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct btree_iter iter;
- struct bkey_s_c k;
- BKEY_PADDED(k) tmp;
- bool have_extent = false;
- int ret = 0;
-
- if (start + len < start)
- return -EINVAL;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(inode->i_ino, start >> 9), k)
- if (bkey_extent_is_data(k.k) ||
- k.k->type == BCH_RESERVATION) {
- if (bkey_cmp(bkey_start_pos(k.k),
- POS(inode->i_ino, (start + len) >> 9)) >= 0)
- break;
-
- if (have_extent) {
- ret = bch_fill_extent(info, &tmp.k, 0);
- if (ret)
- goto out;
- }
-
- bkey_reassemble(&tmp.k, k);
- have_extent = true;
- }
-
- if (have_extent)
- ret = bch_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
-out:
- bch_btree_iter_unlock(&iter);
- return ret < 0 ? ret : 0;
-}
-
-static const struct vm_operations_struct bch_vm_ops = {
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = bch_page_mkwrite,
-};
-
-static int bch_mmap(struct file *file, struct vm_area_struct *vma)
-{
- file_accessed(file);
-
- vma->vm_ops = &bch_vm_ops;
- return 0;
-}
-
-/* Inode flags: */
-
-static const unsigned bch_inode_flags_to_vfs_flags_map[] = {
- [__BCH_INODE_SYNC] = S_SYNC,
- [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
- [__BCH_INODE_APPEND] = S_APPEND,
- [__BCH_INODE_NOATIME] = S_NOATIME,
-};
-
-static const unsigned bch_inode_flags_to_user_flags_map[] = {
- [__BCH_INODE_SYNC] = FS_SYNC_FL,
- [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
- [__BCH_INODE_APPEND] = FS_APPEND_FL,
- [__BCH_INODE_NODUMP] = FS_NODUMP_FL,
- [__BCH_INODE_NOATIME] = FS_NOATIME_FL,
-};
-
-/* Set VFS inode flags from bcache inode: */
-static void bch_inode_flags_to_vfs(struct inode *inode)
-{
- unsigned i, flags = to_bch_ei(inode)->i_flags;
-
- for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_vfs_flags_map); i++)
- if (flags & (1 << i))
- inode->i_flags |= bch_inode_flags_to_vfs_flags_map[i];
- else
- inode->i_flags &= ~bch_inode_flags_to_vfs_flags_map[i];
-}
-
-/* Get FS_IOC_GETFLAGS flags from bcache inode: */
-static unsigned bch_inode_flags_to_user_flags(unsigned flags)
-{
- unsigned i, ret = 0;
-
- for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++)
- if (flags & (1 << i))
- ret |= bch_inode_flags_to_user_flags_map[i];
-
- return ret;
-}
-
-static int bch_inode_user_flags_set(struct bch_inode_info *ei,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- /*
- * We're relying on btree locking here for exclusion with other ioctl
- * calls - use the flags in the btree (@bi), not ei->i_flags:
- */
- unsigned bch_flags = bi->i_flags;
- unsigned oldflags = bch_inode_flags_to_user_flags(bch_flags);
- unsigned newflags = *((unsigned *) p);
- unsigned i;
-
- if (((newflags ^ oldflags) & (FS_APPEND_FL|FS_IMMUTABLE_FL)) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- for (i = 0; i < ARRAY_SIZE(bch_inode_flags_to_user_flags_map); i++) {
- if (newflags & bch_inode_flags_to_user_flags_map[i])
- bch_flags |= (1 << i);
- else
- bch_flags &= ~(1 << i);
-
- newflags &= ~bch_inode_flags_to_user_flags_map[i];
- oldflags &= ~bch_inode_flags_to_user_flags_map[i];
- }
-
- if (oldflags != newflags)
- return -EOPNOTSUPP;
-
- bi->i_flags = bch_flags;
- ei->vfs_inode.i_ctime = current_fs_time(ei->vfs_inode.i_sb);
-
- return 0;
-}
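
Userspace drives this through the standard FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls handled just below; only the five bits in the mapping tables above round-trip, and any other requested bit makes the setter fail with -EOPNOTSUPP. A minimal caller sketch (the kernel side transfers an int, as the compat handler's comment points out):

/* Illustrative userspace caller: set the NOATIME flag on a file. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd, flags;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags)) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}

	flags |= FS_NOATIME_FL;

	if (ioctl(fd, FS_IOC_SETFLAGS, &flags)) {
		perror("FS_IOC_SETFLAGS");
		return 1;
	}

	close(fd);
	return 0;
}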
-
-#define FS_IOC_GOINGDOWN _IOR ('X', 125, __u32)
-
-static long bch_fs_file_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
-{
- struct inode *inode = file_inode(filp);
- struct super_block *sb = inode->i_sb;
- struct bch_fs *c = sb->s_fs_info;
- struct bch_inode_info *ei = to_bch_ei(inode);
- unsigned flags;
- int ret;
-
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- return put_user(bch_inode_flags_to_user_flags(ei->i_flags),
- (int __user *) arg);
-
- case FS_IOC_SETFLAGS: {
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto setflags_out;
- }
-
- if (get_user(flags, (int __user *) arg)) {
- ret = -EFAULT;
- goto setflags_out;
- }
-
- if (!S_ISREG(inode->i_mode) &&
- !S_ISDIR(inode->i_mode) &&
- (flags & (FS_NODUMP_FL|FS_NOATIME_FL)) != flags) {
- ret = -EINVAL;
- goto setflags_out;
- }
-
- inode_lock(inode);
-
- mutex_lock(&ei->update_lock);
- ret = __bch_write_inode(c, ei, bch_inode_user_flags_set, &flags);
- mutex_unlock(&ei->update_lock);
-
- if (!ret)
- bch_inode_flags_to_vfs(inode);
-
- inode_unlock(inode);
-setflags_out:
- mnt_drop_write_file(filp);
- return ret;
- }
-
- case FS_IOC_GETVERSION:
- return -ENOTTY;
- case FS_IOC_SETVERSION:
- return -ENOTTY;
-
- case FS_IOC_GOINGDOWN:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- down_write(&sb->s_umount);
- sb->s_flags |= MS_RDONLY;
- bch_fs_emergency_read_only(c);
- up_write(&sb->s_umount);
- return 0;
-
- default:
- return bch_fs_ioctl(c, cmd, (void __user *) arg);
- }
-}
-
-#ifdef CONFIG_COMPAT
-static long bch_compat_fs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- /* These are just misnamed, they actually get/put from/to user an int */
- switch (cmd) {
-	case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
- default:
- return -ENOIOCTLCMD;
- }
- return bch_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-/* Directories: */
-
-static loff_t bch_dir_llseek(struct file *file, loff_t offset, int whence)
-{
- return generic_file_llseek_size(file, offset, whence,
- S64_MAX, S64_MAX);
-}
-
-static int bch_vfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct inode *inode = file_inode(file);
- struct bch_fs *c = inode->i_sb->s_fs_info;
-
- return bch_readdir(c, file, ctx);
-}
-
-static const struct file_operations bch_file_operations = {
- .llseek = bch_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = bch_write_iter,
- .mmap = bch_mmap,
- .open = generic_file_open,
- .fsync = bch_fsync,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
- .fallocate = bch_fallocate_dispatch,
- .unlocked_ioctl = bch_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch_compat_fs_ioctl,
-#endif
-};
-
-static const struct inode_operations bch_file_inode_operations = {
- .setattr = bch_setattr,
- .fiemap = bch_fiemap,
- .listxattr = bch_xattr_list,
- .get_acl = bch_get_acl,
- .set_acl = bch_set_acl,
-};
-
-static const struct inode_operations bch_dir_inode_operations = {
- .lookup = bch_lookup,
- .create = bch_create,
- .link = bch_link,
- .unlink = bch_unlink,
- .symlink = bch_symlink,
- .mkdir = bch_mkdir,
- .rmdir = bch_rmdir,
- .mknod = bch_mknod,
- .rename = bch_rename2,
- .setattr = bch_setattr,
- .tmpfile = bch_tmpfile,
- .listxattr = bch_xattr_list,
- .get_acl = bch_get_acl,
- .set_acl = bch_set_acl,
-};
-
-static const struct file_operations bch_dir_file_operations = {
- .llseek = bch_dir_llseek,
- .read = generic_read_dir,
- .iterate = bch_vfs_readdir,
- .fsync = bch_fsync,
- .unlocked_ioctl = bch_fs_file_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = bch_compat_fs_ioctl,
-#endif
-};
-
-static const struct inode_operations bch_symlink_inode_operations = {
- .readlink = generic_readlink,
- .get_link = page_get_link,
- .setattr = bch_setattr,
- .listxattr = bch_xattr_list,
- .get_acl = bch_get_acl,
- .set_acl = bch_set_acl,
-};
-
-static const struct inode_operations bch_special_inode_operations = {
- .setattr = bch_setattr,
- .listxattr = bch_xattr_list,
- .get_acl = bch_get_acl,
- .set_acl = bch_set_acl,
-};
-
-static const struct address_space_operations bch_address_space_operations = {
- .writepage = bch_writepage,
- .readpage = bch_readpage,
- .writepages = bch_writepages,
- .readpages = bch_readpages,
- .set_page_dirty = bch_set_page_dirty,
- .write_begin = bch_write_begin,
- .write_end = bch_write_end,
- .invalidatepage = bch_invalidatepage,
- .releasepage = bch_releasepage,
- .direct_IO = bch_direct_IO,
-#ifdef CONFIG_MIGRATION
- .migratepage = bch_migrate_page,
-#endif
- .error_remove_page = generic_error_remove_page,
-};
-
-static void bch_vfs_inode_init(struct bch_fs *c,
- struct bch_inode_info *ei,
- struct bch_inode_unpacked *bi)
-{
- struct inode *inode = &ei->vfs_inode;
-
- pr_debug("init inode %llu with mode %o",
- bi->inum, bi->i_mode);
-
- ei->i_flags = bi->i_flags;
- ei->i_size = bi->i_size;
-
- inode->i_mode = bi->i_mode;
- i_uid_write(inode, bi->i_uid);
- i_gid_write(inode, bi->i_gid);
-
- atomic64_set(&ei->i_sectors, bi->i_sectors);
- inode->i_blocks = bi->i_sectors;
-
- inode->i_ino = bi->inum;
- set_nlink(inode, bi->i_nlink + nlink_bias(inode->i_mode));
- inode->i_rdev = bi->i_dev;
- inode->i_generation = bi->i_generation;
- inode->i_size = bi->i_size;
- inode->i_atime = bch_time_to_timespec(c, bi->i_atime);
- inode->i_mtime = bch_time_to_timespec(c, bi->i_mtime);
- inode->i_ctime = bch_time_to_timespec(c, bi->i_ctime);
- bch_inode_flags_to_vfs(inode);
-
- ei->str_hash = bch_hash_info_init(bi);
-
- inode->i_mapping->a_ops = &bch_address_space_operations;
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- inode->i_op = &bch_file_inode_operations;
- inode->i_fop = &bch_file_operations;
- break;
- case S_IFDIR:
- inode->i_op = &bch_dir_inode_operations;
- inode->i_fop = &bch_dir_file_operations;
- break;
- case S_IFLNK:
- inode_nohighmem(inode);
- inode->i_op = &bch_symlink_inode_operations;
- break;
- default:
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- inode->i_op = &bch_special_inode_operations;
- break;
- }
-}
-
-static struct inode *bch_alloc_inode(struct super_block *sb)
-{
- struct bch_inode_info *ei;
-
- ei = kmem_cache_alloc(bch_inode_cache, GFP_NOFS);
- if (!ei)
- return NULL;
-
- pr_debug("allocated %p", &ei->vfs_inode);
-
- inode_init_once(&ei->vfs_inode);
- mutex_init(&ei->update_lock);
- ei->journal_seq = 0;
- atomic_long_set(&ei->i_size_dirty_count, 0);
- atomic_long_set(&ei->i_sectors_dirty_count, 0);
-
- return &ei->vfs_inode;
-}
-
-static void bch_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
-
- kmem_cache_free(bch_inode_cache, to_bch_ei(inode));
-}
-
-static void bch_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, bch_i_callback);
-}
-
-static int bch_vfs_write_inode(struct inode *inode,
- struct writeback_control *wbc)
-{
- struct bch_fs *c = inode->i_sb->s_fs_info;
- struct bch_inode_info *ei = to_bch_ei(inode);
- int ret;
-
- mutex_lock(&ei->update_lock);
- ret = bch_write_inode(c, ei);
- mutex_unlock(&ei->update_lock);
-
- if (c->opts.journal_flush_disabled)
- return ret;
-
- if (!ret && wbc->sync_mode == WB_SYNC_ALL)
- ret = bch_journal_flush_seq(&c->journal, ei->journal_seq);
-
- return ret;
-}
-
-static void bch_evict_inode(struct inode *inode)
-{
- struct bch_fs *c = inode->i_sb->s_fs_info;
-
- truncate_inode_pages_final(&inode->i_data);
-
- if (!bch_journal_error(&c->journal) && !is_bad_inode(inode)) {
- struct bch_inode_info *ei = to_bch_ei(inode);
-
- /* XXX - we want to check this stuff iff there weren't IO errors: */
- BUG_ON(atomic_long_read(&ei->i_sectors_dirty_count));
- BUG_ON(atomic64_read(&ei->i_sectors) != inode->i_blocks);
- }
-
- clear_inode(inode);
-
- if (!inode->i_nlink && !is_bad_inode(inode)) {
- bch_inode_rm(c, inode->i_ino);
- atomic_long_dec(&c->nr_inodes);
- }
-}
-
-static int bch_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct bch_fs *c = sb->s_fs_info;
- u64 fsid;
-
- buf->f_type = BCACHE_STATFS_MAGIC;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = c->capacity >> PAGE_SECTOR_SHIFT;
- buf->f_bfree = (c->capacity - bch_fs_sectors_used(c)) >> PAGE_SECTOR_SHIFT;
- buf->f_bavail = buf->f_bfree;
- buf->f_files = atomic_long_read(&c->nr_inodes);
- buf->f_ffree = U64_MAX;
-
- fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
- le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
- buf->f_namelen = NAME_MAX;
-
- return 0;
-}
-
-static int bch_sync_fs(struct super_block *sb, int wait)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- if (!wait) {
- bch_journal_flush_async(&c->journal, NULL);
- return 0;
- }
-
- return bch_journal_flush(&c->journal);
-}
-
-static struct bch_fs *bch_open_as_blockdevs(const char *_dev_name,
- struct bch_opts opts)
-{
- size_t nr_devs = 0, i = 0;
- char *dev_name, *s, **devs;
- struct bch_fs *c = NULL;
- const char *err = "cannot allocate memory";
-
- dev_name = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return NULL;
-
- for (s = dev_name; s; s = strchr(s + 1, ':'))
- nr_devs++;
-
- devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
- if (!devs)
- goto err;
-
- for (i = 0, s = dev_name;
- s;
- (s = strchr(s, ':')) && (*s++ = '\0'))
- devs[i++] = s;
-
- err = bch_fs_open(devs, nr_devs, opts, &c);
- if (err) {
- /*
- * Already open?
- * Look up each block device, make sure they all belong to a
- * filesystem and they all belong to the _same_ filesystem
- */
-
- for (i = 0; i < nr_devs; i++) {
- struct block_device *bdev = lookup_bdev(devs[i]);
- struct bch_fs *c2;
-
- if (IS_ERR(bdev))
- goto err;
-
- c2 = bch_bdev_to_fs(bdev);
- bdput(bdev);
-
- if (!c)
- c = c2;
- else if (c2)
- closure_put(&c2->cl);
-
- if (!c)
- goto err;
- if (c != c2) {
- closure_put(&c->cl);
- goto err;
- }
- }
-
- mutex_lock(&c->state_lock);
-
- if (!bch_fs_running(c)) {
- mutex_unlock(&c->state_lock);
- closure_put(&c->cl);
- err = "incomplete filesystem";
- c = NULL;
- goto err;
- }
-
- mutex_unlock(&c->state_lock);
- }
-
- set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
-err:
- kfree(devs);
- kfree(dev_name);
-
- if (!c)
- pr_err("bch_fs_open err %s", err);
- return c;
-}
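
In other words, dev_name is a colon-separated list: a mount of "/dev/sdb:/dev/sdc" (device names purely illustrative) is split in place into devs[] = { "/dev/sdb", "/dev/sdc" } and handed to bch_fs_open(). If that fails because the filesystem is already open, the fallback path resolves each member with lookup_bdev()/bch_bdev_to_fs() and insists that every device maps to the same running struct bch_fs before reusing it.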
-
-static int bch_remount(struct super_block *sb, int *flags, char *data)
-{
- struct bch_fs *c = sb->s_fs_info;
- struct bch_opts opts = bch_opts_empty();
- int ret;
-
- opts.read_only = (*flags & MS_RDONLY) != 0;
-
- ret = bch_parse_mount_opts(&opts, data);
- if (ret)
- return ret;
-
- if (opts.read_only >= 0 &&
- opts.read_only != c->opts.read_only) {
- const char *err = NULL;
-
- if (opts.read_only) {
- bch_fs_read_only(c);
-
- sb->s_flags |= MS_RDONLY;
- } else {
- err = bch_fs_read_write(c);
- if (err) {
- bch_err(c, "error going rw: %s", err);
- return -EINVAL;
- }
-
- sb->s_flags &= ~MS_RDONLY;
- }
-
- c->opts.read_only = opts.read_only;
- }
-
- if (opts.errors >= 0)
- c->opts.errors = opts.errors;
-
- return ret;
-}
-
-static const struct super_operations bch_super_operations = {
- .alloc_inode = bch_alloc_inode,
- .destroy_inode = bch_destroy_inode,
- .write_inode = bch_vfs_write_inode,
- .evict_inode = bch_evict_inode,
- .sync_fs = bch_sync_fs,
- .statfs = bch_statfs,
- .show_options = generic_show_options,
- .remount_fs = bch_remount,
-#if 0
- .put_super = bch_put_super,
- .freeze_fs = bch_freeze,
- .unfreeze_fs = bch_unfreeze,
-#endif
-};
-
-static int bch_test_super(struct super_block *s, void *data)
-{
- return s->s_fs_info == data;
-}
-
-static int bch_set_super(struct super_block *s, void *data)
-{
- s->s_fs_info = data;
- return 0;
-}
-
-static struct dentry *bch_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- struct bch_fs *c;
- struct bch_dev *ca;
- struct super_block *sb;
- struct inode *inode;
- struct bch_opts opts = bch_opts_empty();
- unsigned i;
- int ret;
-
- opts.read_only = (flags & MS_RDONLY) != 0;
-
- ret = bch_parse_mount_opts(&opts, data);
- if (ret)
- return ERR_PTR(ret);
-
- c = bch_open_as_blockdevs(dev_name, opts);
- if (!c)
- return ERR_PTR(-ENOENT);
-
- sb = sget(fs_type, bch_test_super, bch_set_super, flags|MS_NOSEC, c);
- if (IS_ERR(sb)) {
- closure_put(&c->cl);
- return ERR_CAST(sb);
- }
-
- BUG_ON(sb->s_fs_info != c);
-
- if (sb->s_root) {
- closure_put(&c->cl);
-
- if ((flags ^ sb->s_flags) & MS_RDONLY) {
- ret = -EBUSY;
- goto err_put_super;
- }
- goto out;
- }
-
- /* XXX: blocksize */
- sb->s_blocksize = PAGE_SIZE;
- sb->s_blocksize_bits = PAGE_SHIFT;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_op = &bch_super_operations;
- sb->s_xattr = bch_xattr_handlers;
- sb->s_magic = BCACHE_STATFS_MAGIC;
- sb->s_time_gran = c->sb.time_precision;
- c->vfs_sb = sb;
- sb->s_bdi = &c->bdi;
- strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
-
- for_each_online_member(ca, c, i) {
- struct block_device *bdev = ca->disk_sb.bdev;
-
- /* XXX: create an anonymous device for multi device filesystems */
- sb->s_bdev = bdev;
- sb->s_dev = bdev->bd_dev;
- percpu_ref_put(&ca->io_ref);
- break;
- }
-
- if (opts.posix_acl < 0)
- sb->s_flags |= MS_POSIXACL;
- else
- sb->s_flags |= opts.posix_acl ? MS_POSIXACL : 0;
-
- inode = bch_vfs_inode_get(sb, BCACHE_ROOT_INO);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- goto err_put_super;
- }
-
- sb->s_root = d_make_root(inode);
- if (!sb->s_root) {
- ret = -ENOMEM;
- goto err_put_super;
- }
-
- sb->s_flags |= MS_ACTIVE;
-out:
- return dget(sb->s_root);
-
-err_put_super:
- deactivate_locked_super(sb);
- return ERR_PTR(ret);
-}
-
-static void bch_kill_sb(struct super_block *sb)
-{
- struct bch_fs *c = sb->s_fs_info;
-
- generic_shutdown_super(sb);
-
- if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
- bch_fs_stop(c);
- else
- closure_put(&c->cl);
-}
-
-static struct file_system_type bcache_fs_type = {
- .owner = THIS_MODULE,
- .name = "bcache",
- .mount = bch_mount,
- .kill_sb = bch_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
-};
-
-MODULE_ALIAS_FS("bcache");
-
-void bch_vfs_exit(void)
-{
- unregister_filesystem(&bcache_fs_type);
- if (bch_dio_write_bioset)
- bioset_free(bch_dio_write_bioset);
- if (bch_dio_read_bioset)
- bioset_free(bch_dio_read_bioset);
- if (bch_writepage_bioset)
- bioset_free(bch_writepage_bioset);
- if (bch_inode_cache)
- kmem_cache_destroy(bch_inode_cache);
-}
-
-int __init bch_vfs_init(void)
-{
- int ret = -ENOMEM;
-
- bch_inode_cache = KMEM_CACHE(bch_inode_info, 0);
- if (!bch_inode_cache)
- goto err;
-
- bch_writepage_bioset =
- bioset_create(4, offsetof(struct bch_writepage_io, bio.bio));
- if (!bch_writepage_bioset)
- goto err;
-
- bch_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
- if (!bch_dio_read_bioset)
- goto err;
-
- bch_dio_write_bioset = bioset_create(4, offsetof(struct dio_write, bio.bio));
- if (!bch_dio_write_bioset)
- goto err;
-
- ret = register_filesystem(&bcache_fs_type);
- if (ret)
- goto err;
-
- return 0;
-err:
- bch_vfs_exit();
- return ret;
-}
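
The offsetof() arguments passed to bioset_create() here are the reason the embedded bios in fs-io.h are annotated "must come last": each bioset is created with a front pad equal to the offset of the bio inside its wrapper, so bio_alloc_bioset() hands back a bio sitting at the tail of a bch_writepage_io/dio_read/dio_write allocation and the wrapper is recovered with container_of(). Anything placed after the bio would overlap the bio's inline biovec array, which is allocated immediately behind struct bio.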
diff --git a/libbcache/fs.h b/libbcache/fs.h
deleted file mode 100644
index 1c0a2b15..00000000
--- a/libbcache/fs.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _BCACHE_FS_H
-#define _BCACHE_FS_H
-
-#include "str_hash.h"
-
-#include <linux/seqlock.h>
-
-struct bch_inode_info {
- struct inode vfs_inode;
-
- struct mutex update_lock;
- u64 journal_seq;
-
- atomic_long_t i_size_dirty_count;
-
- /*
-	 * these are updated whenever we update the inode in the btree - e.g.
-	 * for fsync
- */
- u64 i_size;
- u32 i_flags;
-
- atomic_long_t i_sectors_dirty_count;
- atomic64_t i_sectors;
-
- struct bch_hash_info str_hash;
-};
-
-#define to_bch_ei(_inode) \
- container_of(_inode, struct bch_inode_info, vfs_inode)
-
-static inline u8 mode_to_type(umode_t mode)
-{
- return (mode >> 12) & 15;
-}
-
-static inline unsigned nlink_bias(umode_t mode)
-{
- return S_ISDIR(mode) ? 2 : 1;
-}
-
-struct bch_inode_unpacked;
-
-#ifndef NO_BCACHE_FS
-
-/* returns 0 if we want to do the update, or error is passed up */
-typedef int (*inode_set_fn)(struct bch_inode_info *,
- struct bch_inode_unpacked *, void *);
-
-int __must_check __bch_write_inode(struct bch_fs *, struct bch_inode_info *,
- inode_set_fn, void *);
-int __must_check bch_write_inode(struct bch_fs *,
- struct bch_inode_info *);
-
-void bch_vfs_exit(void);
-int bch_vfs_init(void);
-
-#else
-
-static inline void bch_vfs_exit(void) {}
-static inline int bch_vfs_init(void) { return 0; }
-
-#endif
-
-#endif /* _BCACHE_FS_H */
diff --git a/libbcache/inode.c b/libbcache/inode.c
deleted file mode 100644
index 2e15497f..00000000
--- a/libbcache/inode.c
+++ /dev/null
@@ -1,451 +0,0 @@
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "inode.h"
-#include "io.h"
-#include "keylist.h"
-
-#include <linux/random.h>
-
-#include <asm/unaligned.h>
-
-static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-static const u8 bits_table[8] = {
- 1 * 8 - 1,
- 2 * 8 - 2,
- 3 * 8 - 3,
- 4 * 8 - 4,
- 6 * 8 - 5,
- 8 * 8 - 6,
- 10 * 8 - 7,
- 13 * 8 - 8,
-};
-
-static int inode_encode_field(u8 *out, u8 *end, const u64 in[2])
-{
- unsigned bytes, bits, shift;
-
- if (likely(!in[1]))
- bits = fls64(in[0]);
- else
- bits = fls64(in[1]) + 64;
-
- for (shift = 1; shift <= 8; shift++)
- if (bits < bits_table[shift - 1])
- goto got_shift;
-
- BUG();
-got_shift:
- bytes = byte_table[shift - 1];
-
- BUG_ON(out + bytes > end);
-
- if (likely(bytes <= 8)) {
- u64 b = cpu_to_be64(in[0]);
-
- memcpy(out, (void *) &b + 8 - bytes, bytes);
- } else {
- u64 b = cpu_to_be64(in[1]);
-
- memcpy(out, (void *) &b + 16 - bytes, bytes);
- put_unaligned_be64(in[0], out + bytes - 8);
- }
-
- *out |= (1 << 8) >> shift;
-
- return bytes;
-}
-
-static int inode_decode_field(const u8 *in, const u8 *end,
- u64 out[2], unsigned *out_bits)
-{
- unsigned bytes, bits, shift;
-
- if (in >= end)
- return -1;
-
- if (!*in)
- return -1;
-
- /*
- * position of highest set bit indicates number of bytes:
- * shift = number of bits to remove in high byte:
- */
- shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
- bytes = byte_table[shift - 1];
- bits = bytes * 8 - shift;
-
- if (in + bytes > end)
- return -1;
-
- /*
- * we're assuming it's safe to deref up to 7 bytes < in; this will work
- * because keys always start quite a bit more than 7 bytes after the
- * start of the btree node header:
- */
- if (likely(bytes <= 8)) {
- out[0] = get_unaligned_be64(in + bytes - 8);
- out[0] <<= 64 - bits;
- out[0] >>= 64 - bits;
- out[1] = 0;
- } else {
- out[0] = get_unaligned_be64(in + bytes - 8);
- out[1] = get_unaligned_be64(in + bytes - 16);
- out[1] <<= 128 - bits;
- out[1] >>= 128 - bits;
- }
-
- *out_bits = out[1] ? 64 + fls64(out[1]) : fls64(out[0]);
- return bytes;
-}
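
A worked example of the encoding implemented by this pair: to encode the value 1000, fls64(1000) = 10 bits, and the first shift whose bits_table[] entry exceeds that is shift = 2 (14 usable bits in 2 bytes). The big-endian bytes of 1000 are 0x03 0xe8; OR-ing in the tag bit (1 << 8) >> 2 = 0x40 gives 0x43 0xe8 on disk, and inode_encode_field() returns 2. Decoding finds the highest set bit of 0x43 at position 6, so shift = 8 - 6 = 2, bytes = 2, bits = 14; masking the loaded value down to 14 bits recovers 0x3e8 = 1000.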
-
-void bch_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- u8 *out = packed->inode.v.fields;
- u8 *end = (void *) &packed[1];
- u8 *last_nonzero_field = out;
- u64 field[2];
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
-
- bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.inode = inode->inum;
- packed->inode.v.i_hash_seed = inode->i_hash_seed;
- packed->inode.v.i_flags = cpu_to_le32(inode->i_flags);
- packed->inode.v.i_mode = cpu_to_le16(inode->i_mode);
-
-#define BCH_INODE_FIELD(_name, _bits) \
- field[0] = inode->_name; \
- field[1] = 0; \
- out += inode_encode_field(out, end, field); \
- nr_fields++; \
- \
- if (field[0] | field[1]) { \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- }
-
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
-
- out = last_nonzero_field;
- nr_fields = last_nonzero_fieldnr;
-
- set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
- memset(out, 0,
- (u8 *) &packed->inode.v +
- bkey_val_bytes(&packed->inode.k) - out);
-
- SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
-
- if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
- struct bch_inode_unpacked unpacked;
-
- int ret = bch_inode_unpack(inode_i_to_s_c(&packed->inode),
- &unpacked);
- BUG_ON(ret);
- BUG_ON(unpacked.inum != inode->inum);
- BUG_ON(unpacked.i_hash_seed != inode->i_hash_seed);
- BUG_ON(unpacked.i_mode != inode->i_mode);
-
-#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name);
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
- }
-}
-
-int bch_inode_unpack(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
-{
- const u8 *in = inode.v->fields;
- const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
- u64 field[2];
- unsigned fieldnr = 0, field_bits;
- int ret;
-
- unpacked->inum = inode.k->p.inode;
- unpacked->i_hash_seed = inode.v->i_hash_seed;
- unpacked->i_flags = le32_to_cpu(inode.v->i_flags);
- unpacked->i_mode = le16_to_cpu(inode.v->i_mode);
-
-#define BCH_INODE_FIELD(_name, _bits) \
- if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
- memset(&unpacked->_name, 0, \
- sizeof(*unpacked) - \
- offsetof(struct bch_inode_unpacked, _name)); \
- return 0; \
- } \
- \
- ret = inode_decode_field(in, end, field, &field_bits); \
- if (ret < 0) \
- return ret; \
- \
- if (field_bits > sizeof(unpacked->_name) * 8) \
- return -1; \
- \
- unpacked->_name = field[0]; \
- in += ret;
-
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
-
- /* XXX: signal if there were more fields than expected? */
-
- return 0;
-}
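
Both the pack and unpack loops are stamped out by the BCH_INODE_FIELDS()/BCH_INODE_FIELD() x-macro from the on-disk format headers (not part of this diff), which is what lets new fields be appended to the inode without breaking old keys: unpack simply zeroes everything past INODE_NR_FIELDS(). A stripped-down illustration of the idiom, with made-up field names and widths:

/* Hypothetical x-macro illustration - not the real bcache field list. */
#define EXAMPLE_FIELDS()		\
	EXAMPLE_FIELD(i_size,  64)	\
	EXAMPLE_FIELD(i_mtime, 64)	\
	EXAMPLE_FIELD(i_nlink, 32)

struct example_inode {
#define EXAMPLE_FIELD(_name, _bits)	u##_bits _name;
	EXAMPLE_FIELDS()
#undef EXAMPLE_FIELD
};

static void example_print(const struct example_inode *u)
{
#define EXAMPLE_FIELD(_name, _bits)					\
	printk(KERN_DEBUG #_name " = %llu\n", (unsigned long long) u->_name);
	EXAMPLE_FIELDS()
#undef EXAMPLE_FIELD
}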
-
-static const char *bch_inode_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
-{
- if (k.k->p.offset)
- return "nonzero offset";
-
- switch (k.k->type) {
- case BCH_INODE_FS: {
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
-
- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
- return "incorrect value size";
-
- if (k.k->p.inode < BLOCKDEV_INODE_MAX)
- return "fs inode in blockdev range";
-
- if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
- return "invalid str hash type";
-
- if (bch_inode_unpack(inode, &unpacked))
- return "invalid variable length fields";
-
- return NULL;
- }
- case BCH_INODE_BLOCKDEV:
- if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
- return "incorrect value size";
-
- if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
- return "blockdev inode in fs range";
-
- return NULL;
- default:
- return "invalid type";
- }
-}
-
-static void bch_inode_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
-{
- struct bkey_s_c_inode inode;
- struct bch_inode_unpacked unpacked;
-
- switch (k.k->type) {
- case BCH_INODE_FS:
- inode = bkey_s_c_to_inode(k);
- if (bch_inode_unpack(inode, &unpacked)) {
- scnprintf(buf, size, "(unpack error)");
- break;
- }
-
- scnprintf(buf, size, "i_size %llu", unpacked.i_size);
- break;
- }
-}
-
-const struct bkey_ops bch_bkey_inode_ops = {
- .key_invalid = bch_inode_invalid,
- .val_to_text = bch_inode_to_text,
-};
-
-void bch_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
-{
- s64 now = timespec_to_bch_time(c, CURRENT_TIME);
-
- memset(inode_u, 0, sizeof(*inode_u));
-
- /* ick */
- inode_u->i_flags |= c->sb.str_hash_type << INODE_STR_HASH_OFFSET;
- get_random_bytes(&inode_u->i_hash_seed, sizeof(inode_u->i_hash_seed));
-
- inode_u->i_mode = mode;
- inode_u->i_uid = uid;
- inode_u->i_gid = gid;
- inode_u->i_dev = rdev;
- inode_u->i_atime = now;
- inode_u->i_mtime = now;
- inode_u->i_ctime = now;
- inode_u->i_otime = now;
-}
-
-int bch_inode_create(struct bch_fs *c, struct bkey_i *inode,
- u64 min, u64 max, u64 *hint)
-{
- struct btree_iter iter;
- bool searched_from_start = false;
- int ret;
-
- if (!max)
- max = ULLONG_MAX;
-
- if (c->opts.inodes_32bit)
- max = min_t(u64, max, U32_MAX);
-
- if (*hint >= max || *hint < min)
- *hint = min;
-
- if (*hint == min)
- searched_from_start = true;
-again:
- bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(*hint, 0));
-
- while (1) {
- struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
-
- ret = btree_iter_err(k);
- if (ret) {
- bch_btree_iter_unlock(&iter);
- return ret;
- }
-
- if (k.k->type < BCH_INODE_FS) {
- inode->k.p = k.k->p;
-
- pr_debug("inserting inode %llu (size %u)",
- inode->k.p.inode, inode->k.u64s);
-
- ret = bch_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&iter, inode));
-
- if (ret == -EINTR)
- continue;
-
- bch_btree_iter_unlock(&iter);
- if (!ret)
- *hint = k.k->p.inode + 1;
-
- return ret;
- } else {
- if (iter.pos.inode == max)
- break;
- /* slot used */
- bch_btree_iter_advance_pos(&iter);
- }
- }
- bch_btree_iter_unlock(&iter);
-
- if (!searched_from_start) {
- /* Retry from start */
- *hint = min;
- searched_from_start = true;
- goto again;
- }
-
- return -ENOSPC;
-}
-
-int bch_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
- struct extent_insert_hook *hook, u64 *journal_seq)
-{
- return bch_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0),
- ZERO_VERSION, NULL, hook, journal_seq);
-}
-
-int bch_inode_rm(struct bch_fs *c, u64 inode_nr)
-{
- struct bkey_i delete;
- int ret;
-
- ret = bch_inode_truncate(c, inode_nr, 0, NULL, NULL);
- if (ret < 0)
- return ret;
-
- ret = bch_btree_delete_range(c, BTREE_ID_XATTRS,
- POS(inode_nr, 0),
- POS(inode_nr + 1, 0),
- ZERO_VERSION, NULL, NULL, NULL);
- if (ret < 0)
- return ret;
-
- /*
- * If this was a directory, there shouldn't be any real dirents left -
- * but there could be whiteouts (from hash collisions) that we should
- * delete:
- *
-	 * XXX: the dirent code could ideally delete whiteouts when they're no
-	 * longer needed
- */
- ret = bch_btree_delete_range(c, BTREE_ID_DIRENTS,
- POS(inode_nr, 0),
- POS(inode_nr + 1, 0),
- ZERO_VERSION, NULL, NULL, NULL);
- if (ret < 0)
- return ret;
-
- bkey_init(&delete.k);
- delete.k.p.inode = inode_nr;
-
- return bch_btree_insert(c, BTREE_ID_INODES, &delete, NULL,
- NULL, NULL, BTREE_INSERT_NOFAIL);
-}
-
-int bch_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = -ENOENT;
-
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
- POS(inode_nr, 0), k) {
- switch (k.k->type) {
- case BCH_INODE_FS:
- ret = bch_inode_unpack(bkey_s_c_to_inode(k), inode);
- break;
- default:
- /* hole, not found */
- break;
- }
-
- break;
-
- }
-
- return bch_btree_iter_unlock(&iter) ?: ret;
-}
-
-int bch_cached_dev_inode_find_by_uuid(struct bch_fs *c, uuid_le *uuid,
- struct bkey_i_inode_blockdev *ret)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
-
- for_each_btree_key(&iter, c, BTREE_ID_INODES, POS(0, 0), k) {
- if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
- break;
-
- if (k.k->type == BCH_INODE_BLOCKDEV) {
- struct bkey_s_c_inode_blockdev inode =
- bkey_s_c_to_inode_blockdev(k);
-
- pr_debug("found inode %llu: %pU (u64s %u)",
- inode.k->p.inode, inode.v->i_uuid.b,
- inode.k->u64s);
-
- if (CACHED_DEV(inode.v) &&
- !memcmp(uuid, &inode.v->i_uuid, 16)) {
- bkey_reassemble(&ret->k_i, k);
- bch_btree_iter_unlock(&iter);
- return 0;
- }
- }
-
- bch_btree_iter_cond_resched(&iter);
- }
- bch_btree_iter_unlock(&iter);
- return -ENOENT;
-}
diff --git a/libbcache/inode.h b/libbcache/inode.h
deleted file mode 100644
index 41e344d5..00000000
--- a/libbcache/inode.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef _BCACHE_INODE_H
-#define _BCACHE_INODE_H
-
-#include <linux/math64.h>
-
-extern const struct bkey_ops bch_bkey_inode_ops;
-
-struct bch_inode_unpacked {
- u64 inum;
- __le64 i_hash_seed;
- u32 i_flags;
- u16 i_mode;
-
-#define BCH_INODE_FIELD(_name, _bits) u##_bits _name;
- BCH_INODE_FIELDS()
-#undef BCH_INODE_FIELD
-};
-
-struct bkey_inode_buf {
- struct bkey_i_inode inode;
-
-#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8
- u8 _pad[0 + BCH_INODE_FIELDS()];
-#undef BCH_INODE_FIELD
-} __packed;
-
-void bch_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
-int bch_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
-
-void bch_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
- uid_t, gid_t, umode_t, dev_t);
-int bch_inode_create(struct bch_fs *, struct bkey_i *, u64, u64, u64 *);
-int bch_inode_truncate(struct bch_fs *, u64, u64,
- struct extent_insert_hook *, u64 *);
-int bch_inode_rm(struct bch_fs *, u64);
-
-int bch_inode_find_by_inum(struct bch_fs *, u64,
- struct bch_inode_unpacked *);
-int bch_cached_dev_inode_find_by_uuid(struct bch_fs *, uuid_le *,
- struct bkey_i_inode_blockdev *);
-
-static inline struct timespec bch_time_to_timespec(struct bch_fs *c, u64 time)
-{
- return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo);
-}
-
-static inline u64 timespec_to_bch_time(struct bch_fs *c, struct timespec ts)
-{
- s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo;
-
- if (c->sb.time_precision == 1)
- return ns;
-
- return div_s64(ns, c->sb.time_precision);
-}
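
/*
 * Illustrative, standalone sketch (not from the removed header): the two
 * helpers above encode times as nanoseconds offset by sb.time_base_lo and
 * scaled by sb.time_precision.  A round trip with hypothetical base and
 * precision values:
 */
#include <stdint.h>

static uint64_t ns_to_bch_time(int64_t ns, int64_t base_ns, int64_t precision)
{
	return (uint64_t) ((ns - base_ns) / precision);
}

static int64_t bch_time_to_ns(uint64_t t, int64_t base_ns, int64_t precision)
{
	return (int64_t) t * precision + base_ns;
}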
-
-#endif
diff --git a/libbcache/io.c b/libbcache/io.c
deleted file mode 100644
index 753c8a3d..00000000
--- a/libbcache/io.c
+++ /dev/null
@@ -1,1435 +0,0 @@
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "alloc.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "clock.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "notify.h"
-#include "stats.h"
-#include "super-io.h"
-
-#include <linux/blkdev.h>
-#include <linux/random.h>
-
-#include <trace/events/bcache.h>
-
-static inline void __bio_inc_remaining(struct bio *bio)
-{
- bio_set_flag(bio, BIO_CHAIN);
- smp_mb__before_atomic();
- atomic_inc(&bio->__bi_remaining);
-}
-
-void bch_generic_make_request(struct bio *bio, struct bch_fs *c)
-{
- if (current->bio_list) {
- spin_lock(&c->bio_submit_lock);
- bio_list_add(&c->bio_submit_list, bio);
- spin_unlock(&c->bio_submit_lock);
- queue_work(bcache_io_wq, &c->bio_submit_work);
- } else {
- generic_make_request(bio);
- }
-}
-
-void bch_bio_submit_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- bio_submit_work);
- struct bio_list bl;
- struct bio *bio;
-
- spin_lock(&c->bio_submit_lock);
- bl = c->bio_submit_list;
- bio_list_init(&c->bio_submit_list);
- spin_unlock(&c->bio_submit_lock);
-
- while ((bio = bio_list_pop(&bl)))
- generic_make_request(bio);
-}
-
-/* Allocate, free from mempool: */
-
-void bch_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
- struct bio_vec *bv;
- unsigned i;
-
- bio_for_each_segment_all(bv, bio, i)
- if (bv->bv_page != ZERO_PAGE(0))
- mempool_free(bv->bv_page, &c->bio_bounce_pages);
- bio->bi_vcnt = 0;
-}
-
-static void bch_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio,
- bool *using_mempool)
-{
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++];
-
- if (likely(!*using_mempool)) {
- bv->bv_page = alloc_page(GFP_NOIO);
- if (unlikely(!bv->bv_page)) {
- mutex_lock(&c->bio_bounce_pages_lock);
- *using_mempool = true;
- goto pool_alloc;
-
- }
- } else {
-pool_alloc:
- bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
- }
-
- bv->bv_len = PAGE_SIZE;
- bv->bv_offset = 0;
-}
-
-void bch_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t bytes)
-{
- bool using_mempool = false;
-
- bio->bi_iter.bi_size = bytes;
-
- while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE))
- bch_bio_alloc_page_pool(c, bio, &using_mempool);
-
- if (using_mempool)
- mutex_unlock(&c->bio_bounce_pages_lock);
-}
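
/*
 * Illustrative, standalone sketch (not from the removed file): the page
 * allocation helpers above try a normal allocation first and only fall back
 * to the shared bounce pool (serialized by a mutex) once that fails, keeping
 * the pool off the fast path.  Reduced to a hypothetical buffer pool; malloc
 * and pool_alloc stand in for alloc_page() and mempool_alloc():
 */
#include <stdbool.h>
#include <stdlib.h>

static void *alloc_buf(void *(*pool_alloc)(void), bool *using_pool)
{
	void *p;

	if (!*using_pool) {
		p = malloc(4096);	/* alloc_page(GFP_NOIO) analogue */
		if (p)
			return p;
		*using_pool = true;	/* switch to the (locked) pool */
	}

	return pool_alloc();		/* mempool_alloc() analogue */
}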
-
-/* Bios with headers */
-
-static void bch_submit_wbio(struct bch_fs *c, struct bch_write_bio *wbio,
- struct bch_dev *ca, const struct bch_extent_ptr *ptr,
- bool punt)
-{
- wbio->ca = ca;
- wbio->submit_time_us = local_clock_us();
- wbio->bio.bi_iter.bi_sector = ptr->offset;
- wbio->bio.bi_bdev = ca ? ca->disk_sb.bdev : NULL;
-
- if (!ca)
- bcache_io_error(c, &wbio->bio, "device has been removed");
- else if (punt)
- bch_generic_make_request(&wbio->bio, c);
- else
- generic_make_request(&wbio->bio);
-}
-
-void bch_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
- const struct bkey_i *k, bool punt)
-{
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- struct bch_write_bio *n;
- struct bch_dev *ca;
-
- BUG_ON(c->opts.nochanges);
-
- wbio->split = false;
- wbio->c = c;
-
- extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
- if (!percpu_ref_tryget(&ca->io_ref)) {
- bch_submit_wbio(c, wbio, NULL, ptr, punt);
- break;
- }
-
- if (ptr + 1 < &extent_entry_last(e)->ptr) {
- n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
- &ca->replica_set));
-
- n->bio.bi_end_io = wbio->bio.bi_end_io;
- n->bio.bi_private = wbio->bio.bi_private;
- n->c = c;
- n->orig = &wbio->bio;
- n->bounce = false;
- n->split = true;
- n->put_bio = true;
- n->bio.bi_opf = wbio->bio.bi_opf;
- __bio_inc_remaining(n->orig);
- } else {
- n = wbio;
- }
-
- if (!journal_flushes_device(ca))
- n->bio.bi_opf |= REQ_FUA;
-
- bch_submit_wbio(c, n, ca, ptr, punt);
- }
-}
-
-/* IO errors */
-
-/* Writes */
-
-static struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->alloc_reserve == RESERVE_MOVINGGC
- ? op->c->copygc_wq
- : op->c->wq;
-}
-
-static void __bch_write(struct closure *);
-
-static void bch_write_done(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-
- BUG_ON(!(op->flags & BCH_WRITE_DONE));
-
- if (!op->error && (op->flags & BCH_WRITE_FLUSH))
- op->error = bch_journal_error(&op->c->journal);
-
- bch_disk_reservation_put(op->c, &op->res);
- percpu_ref_put(&op->c->writes);
- bch_keylist_free(&op->insert_keys, op->inline_keys);
- closure_return(cl);
-}
-
-static u64 keylist_sectors(struct keylist *keys)
-{
- struct bkey_i *k;
- u64 ret = 0;
-
- for_each_keylist_key(keys, k)
- ret += k->k.size;
-
- return ret;
-}
-
-static int bch_write_index_default(struct bch_write_op *op)
-{
- struct keylist *keys = &op->insert_keys;
- struct btree_iter iter;
- int ret;
-
- bch_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch_keylist_front(keys)->k));
-
- ret = bch_btree_insert_list_at(&iter, keys, &op->res,
- NULL, op_journal_seq(op),
- BTREE_INSERT_NOFAIL);
- bch_btree_iter_unlock(&iter);
-
- return ret;
-}
-
-/**
- * bch_write_index - after a write, update index to point to new data
- */
-static void bch_write_index(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- unsigned i;
-
- op->flags |= BCH_WRITE_LOOPED;
-
- if (!bch_keylist_empty(keys)) {
- u64 sectors_start = keylist_sectors(keys);
- int ret = op->index_update_fn(op);
-
- BUG_ON(keylist_sectors(keys) && !ret);
-
- op->written += sectors_start - keylist_sectors(keys);
-
- if (ret) {
- __bcache_io_error(c, "btree IO error %i", ret);
- op->error = ret;
- }
- }
-
- for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++)
- if (op->open_buckets[i]) {
- bch_open_bucket_put(c,
- c->open_buckets +
- op->open_buckets[i]);
- op->open_buckets[i] = 0;
- }
-
- if (!(op->flags & BCH_WRITE_DONE))
- continue_at(cl, __bch_write, op->io_wq);
-
- if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
- bch_journal_flush_seq_async(&c->journal,
- *op_journal_seq(op),
- cl);
- continue_at(cl, bch_write_done, index_update_wq(op));
- } else {
- continue_at_nobarrier(cl, bch_write_done, NULL);
- }
-}
-
-/**
- * bch_write_discard - discard range of keys
- *
- * Used to implement discard, and to handle when writethrough write hits
- * a write error on the cache device.
- */
-static void bch_write_discard(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio;
- struct bpos end = op->pos;
-
- end.offset += bio_sectors(bio);
-
- op->error = bch_discard(op->c, op->pos, end, op->version,
- &op->res, NULL, NULL);
-}
-
-/*
- * Convert extents to be inserted to discards after an error:
- */
-static void bch_write_io_error(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-
- if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
- struct bkey_i *src = bch_keylist_front(&op->insert_keys);
- struct bkey_i *dst = bch_keylist_front(&op->insert_keys);
-
- /*
- * Our data write just errored, which means we've got a bunch
- * of keys to insert that point to data that wasn't
- * successfully written.
- *
- * We don't have to insert those keys but we still have to
- * invalidate that region of the cache - so, if we just strip
- * off all the pointers from the keys we'll accomplish just
- * that.
- */
-
- while (src != op->insert_keys.top) {
- struct bkey_i *n = bkey_next(src);
-
- set_bkey_val_u64s(&src->k, 0);
- src->k.type = KEY_TYPE_DISCARD;
- bkey_copy(dst, src);
-
- dst = bkey_next(dst);
- src = n;
- }
-
- op->insert_keys.top = dst;
- op->flags |= BCH_WRITE_DISCARD;
- } else {
- /* TODO: We could try to recover from this. */
- while (!bch_keylist_empty(&op->insert_keys))
- bch_keylist_pop_front(&op->insert_keys);
-
- op->error = -EIO;
- op->flags |= BCH_WRITE_DONE;
- }
-
- bch_write_index(cl);
-}
-
-static void bch_write_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_fs *c = wbio->c;
- struct bio *orig = wbio->orig;
- struct bch_dev *ca = wbio->ca;
-
- if (bch_dev_nonfatal_io_err_on(bio->bi_error, ca,
- "data write")) {
- set_closure_fn(cl, bch_write_io_error, index_update_wq(op));
- }
-
- bch_account_io_completion_time(ca, wbio->submit_time_us,
- REQ_OP_WRITE);
- if (ca)
- percpu_ref_put(&ca->io_ref);
-
- if (bio->bi_error && orig)
- orig->bi_error = bio->bi_error;
-
- if (wbio->bounce)
- bch_bio_free_pages_pool(c, bio);
-
- if (wbio->put_bio)
- bio_put(bio);
-
- if (orig)
- bio_endio(orig);
- else
- closure_put(cl);
-}
-
-static struct nonce extent_nonce(struct bversion version,
- unsigned nonce,
- unsigned uncompressed_size,
- unsigned compression_type)
-{
- return (struct nonce) {{
- [0] = cpu_to_le32((nonce << 12) |
- (uncompressed_size << 22)),
- [1] = cpu_to_le32(version.lo),
- [2] = cpu_to_le32(version.lo >> 32),
- [3] = cpu_to_le32(version.hi|
- (compression_type << 24))^BCH_NONCE_EXTENT,
- }};
-}
-
-static void init_append_extent(struct bch_write_op *op,
- unsigned compressed_size,
- unsigned uncompressed_size,
- unsigned compression_type,
- unsigned nonce,
- struct bch_csum csum, unsigned csum_type,
- struct open_bucket *ob)
-{
- struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
-
- op->pos.offset += uncompressed_size;
- e->k.p = op->pos;
- e->k.size = uncompressed_size;
- e->k.version = op->version;
- bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
-
- bch_extent_crc_append(e, compressed_size,
- uncompressed_size,
- compression_type,
- nonce, csum, csum_type);
-
- bch_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas,
- ob, compressed_size);
-
- bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED));
- bch_keylist_push(&op->insert_keys);
-}
-
-static int bch_write_extent(struct bch_write_op *op,
- struct open_bucket *ob,
- struct bio *orig)
-{
- struct bch_fs *c = op->c;
- struct bio *bio;
- struct bch_write_bio *wbio;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
- struct bkey_i *key_to_write;
- unsigned csum_type = op->csum_type;
- unsigned compression_type = op->compression_type;
- int ret;
-
- /* don't refetch csum type/compression type */
- barrier();
-
- /* Need to decompress data? */
- if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
- (crc_uncompressed_size(NULL, &op->crc) != op->size ||
- crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
- int ret;
-
- ret = bch_bio_uncompress_inplace(c, orig, op->size, op->crc);
- if (ret)
- return ret;
-
- op->flags &= ~BCH_WRITE_DATA_COMPRESSED;
- }
-
- if (op->flags & BCH_WRITE_DATA_COMPRESSED) {
- init_append_extent(op,
- crc_compressed_size(NULL, &op->crc),
- crc_uncompressed_size(NULL, &op->crc),
- op->crc.compression_type,
- op->crc.nonce,
- op->crc.csum,
- op->crc.csum_type,
- ob);
-
- bio = orig;
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = false;
- wbio->put_bio = false;
- ret = 0;
- } else if (csum_type != BCH_CSUM_NONE ||
- compression_type != BCH_COMPRESSION_NONE) {
- /* all units here in bytes */
- unsigned total_output = 0, output_available =
- min(ob->sectors_free << 9, orig->bi_iter.bi_size);
- unsigned crc_nonce = bch_csum_type_is_encryption(csum_type)
- ? op->nonce : 0;
- struct bch_csum csum;
- struct nonce nonce;
-
- bio = bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(output_available, PAGE_SIZE),
- &c->bio_write);
- /*
- * XXX: can't use mempool for more than
- * BCH_COMPRESSED_EXTENT_MAX worth of pages
- */
- bch_bio_alloc_pages_pool(c, bio, output_available);
-
- /* copy WRITE_SYNC flag */
- bio->bi_opf = orig->bi_opf;
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = true;
- wbio->put_bio = true;
-
- do {
- unsigned fragment_compression_type = compression_type;
- size_t dst_len, src_len;
-
- bch_bio_compress(c, bio, &dst_len,
- orig, &src_len,
- &fragment_compression_type);
-
- BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size);
- BUG_ON(!src_len || src_len > orig->bi_iter.bi_size);
- BUG_ON(dst_len & (block_bytes(c) - 1));
- BUG_ON(src_len & (block_bytes(c) - 1));
-
- swap(bio->bi_iter.bi_size, dst_len);
- nonce = extent_nonce(op->version,
- crc_nonce,
- src_len >> 9,
-					     compression_type);
-
- bch_encrypt_bio(c, csum_type, nonce, bio);
-
- csum = bch_checksum_bio(c, csum_type, nonce, bio);
- swap(bio->bi_iter.bi_size, dst_len);
-
- init_append_extent(op,
- dst_len >> 9, src_len >> 9,
- fragment_compression_type,
- crc_nonce, csum, csum_type, ob);
-
- total_output += dst_len;
- bio_advance(bio, dst_len);
- bio_advance(orig, src_len);
- } while (bio->bi_iter.bi_size &&
- orig->bi_iter.bi_size &&
- !bch_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
-
- BUG_ON(total_output > output_available);
-
- memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
- bio->bi_iter.bi_size = total_output;
-
- /*
- * Free unneeded pages after compressing:
- */
- while (bio->bi_vcnt * PAGE_SIZE >
- round_up(bio->bi_iter.bi_size, PAGE_SIZE))
- mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
- &c->bio_bounce_pages);
-
- ret = orig->bi_iter.bi_size != 0;
- } else {
- bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
- &c->bio_write);
-
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = false;
- wbio->put_bio = bio != orig;
-
- init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
- compression_type, 0,
- (struct bch_csum) { 0 }, csum_type, ob);
-
- ret = bio != orig;
- }
-
- bio->bi_end_io = bch_write_endio;
- bio->bi_private = &op->cl;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
- closure_get(bio->bi_private);
-
- /* might have done a realloc... */
-
- key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
- bch_check_mark_super(c, key_to_write, false);
-
-#ifndef CONFIG_BCACHE_NO_IO
- bch_submit_wbio_replicas(to_wbio(bio), c, key_to_write, false);
-#else
- to_wbio(bio)->ca = NULL;
- bio_endio(bio);
-#endif
- return ret;
-}
-
-static void __bch_write(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
- struct bio *bio = &op->bio->bio;
- unsigned open_bucket_nr = 0;
- struct open_bucket *b;
- int ret;
-
- memset(op->open_buckets, 0, sizeof(op->open_buckets));
-
- if (op->flags & BCH_WRITE_DISCARD) {
- op->flags |= BCH_WRITE_DONE;
- bch_write_discard(cl);
- bio_put(bio);
- continue_at(cl, bch_write_done, index_update_wq(op));
- }
-
- /*
- * Journal writes are marked REQ_PREFLUSH; if the original write was a
- * flush, it'll wait on the journal write.
- */
- bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
- do {
- EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
- EBUG_ON(!bio_sectors(bio));
-
- if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
- continue_at(cl, bch_write_index, index_update_wq(op));
-
-		/* for the device pointers and 1 for the checksum */
- if (bch_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX))
- continue_at(cl, bch_write_index, index_update_wq(op));
-
- b = bch_alloc_sectors_start(c, op->wp,
- op->nr_replicas,
- c->opts.data_replicas_required,
- op->alloc_reserve,
- (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
- EBUG_ON(!b);
-
- if (unlikely(IS_ERR(b))) {
- if (unlikely(PTR_ERR(b) != -EAGAIN)) {
- ret = PTR_ERR(b);
- goto err;
- }
-
- /*
- * If we already have some keys, must insert them first
- * before allocating another open bucket. We only hit
- * this case if open_bucket_nr > 1.
- */
- if (!bch_keylist_empty(&op->insert_keys))
- continue_at(cl, bch_write_index,
- index_update_wq(op));
-
- /*
- * If we've looped, we're running out of a workqueue -
- * not the bch_write() caller's context - and we don't
- * want to block the workqueue:
- */
- if (op->flags & BCH_WRITE_LOOPED)
- continue_at(cl, __bch_write, op->io_wq);
-
- /*
- * Otherwise, we do want to block the caller on alloc
- * failure instead of letting it queue up more and more
- * writes:
- * XXX: this technically needs a try_to_freeze() -
- * except that that's not safe because caller may have
- * issued other IO... hmm..
- */
- closure_sync(cl);
- continue;
- }
-
- BUG_ON(b - c->open_buckets == 0 ||
- b - c->open_buckets > U8_MAX);
- op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
-
- ret = bch_write_extent(op, b, bio);
-
- bch_alloc_sectors_done(c, op->wp, b);
-
- if (ret < 0)
- goto err;
- } while (ret);
-
- op->flags |= BCH_WRITE_DONE;
- continue_at(cl, bch_write_index, index_update_wq(op));
-err:
- if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
- /*
- * If we were writing cached data, not doing the write is fine
- * so long as we discard whatever would have been overwritten -
- * then it's equivalent to doing the write and immediately
- * reclaiming it.
- */
-
- bch_write_discard(cl);
- } else {
- /*
- * Right now we can only error here if we went RO - the
- * allocation failed, but we already checked for -ENOSPC when we
- * got our reservation.
- *
- * XXX capacity might have changed, but we don't check for that
- * yet:
- */
- op->error = ret;
- }
-
- op->flags |= BCH_WRITE_DONE;
-
- /*
- * No reason not to insert keys for whatever data was successfully
- * written (especially for a cmpxchg operation that's moving data
- * around)
- */
- continue_at(cl, !bch_keylist_empty(&op->insert_keys)
- ? bch_write_index
- : bch_write_done, index_update_wq(op));
-}
-
-void bch_wake_delayed_writes(unsigned long data)
-{
- struct bch_fs *c = (void *) data;
- struct bch_write_op *op;
- unsigned long flags;
-
- spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
-
- while ((op = c->write_wait_head)) {
- if (time_after(op->expires, jiffies)) {
- mod_timer(&c->foreground_write_wakeup, op->expires);
- break;
- }
-
- c->write_wait_head = op->next;
- if (!c->write_wait_head)
- c->write_wait_tail = NULL;
-
- closure_put(&op->cl);
- }
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
-}
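
/*
 * Illustrative, standalone sketch (not from the removed file): the wakeup
 * loop above walks a singly linked list of waiters in submission order,
 * waking every entry whose expiry has passed and re-arming the timer for the
 * first one still pending.  `now`, `wake()` and the rearm output are
 * hypothetical stand-ins (jiffies wraparound is ignored here).
 */
#include <stddef.h>

struct waiter {
	struct waiter	*next;
	unsigned long	expires;
};

static void wake(struct waiter *w)
{
	(void) w;				/* closure_put() analogue */
}

static void wake_expired(struct waiter **head, struct waiter **tail,
			 unsigned long now, unsigned long *rearm)
{
	struct waiter *w;

	while ((w = *head)) {
		if (w->expires > now) {		/* time_after() analogue */
			*rearm = w->expires;	/* mod_timer() analogue */
			break;
		}

		*head = w->next;
		if (!*head)
			*tail = NULL;

		wake(w);
	}
}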
-
-/**
- * bch_write - handle a write to a cache device or flash only volume
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written it calls bch_journal, and after the keys have been
- * added to the next journal write they're inserted into the btree.
- *
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-void bch_write(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio;
- struct bch_fs *c = op->c;
- u64 inode = op->pos.inode;
-
- trace_bcache_write(c, inode, bio,
- !(op->flags & BCH_WRITE_CACHED),
- op->flags & BCH_WRITE_DISCARD);
-
- if (c->opts.nochanges ||
- !percpu_ref_tryget(&c->writes)) {
- __bcache_io_error(c, "read only");
- op->error = -EROFS;
- bch_disk_reservation_put(c, &op->res);
- closure_return(cl);
- }
-
- if (bversion_zero(op->version) &&
- bch_csum_type_is_encryption(op->csum_type))
- op->version.lo =
- atomic64_inc_return(&c->key_version) + 1;
-
- if (!(op->flags & BCH_WRITE_DISCARD))
- bch_increment_clock(c, bio_sectors(bio), WRITE);
-
- if (!(op->flags & BCH_WRITE_DISCARD))
- bch_mark_foreground_write(c, bio_sectors(bio));
- else
- bch_mark_discard(c, bio_sectors(bio));
-
- /* Don't call bch_next_delay() if rate is >= 1 GB/sec */
-
- if (c->foreground_write_ratelimit_enabled &&
- c->foreground_write_pd.rate.rate < (1 << 30) &&
- !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) {
- unsigned long flags;
- u64 delay;
-
- spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
- bch_ratelimit_increment(&c->foreground_write_pd.rate,
- bio->bi_iter.bi_size);
-
- delay = bch_ratelimit_delay(&c->foreground_write_pd.rate);
-
- if (delay >= HZ / 100) {
- trace_bcache_write_throttle(c, inode, bio, delay);
-
- closure_get(&op->cl); /* list takes a ref */
-
- op->expires = jiffies + delay;
- op->next = NULL;
-
- if (c->write_wait_tail)
- c->write_wait_tail->next = op;
- else
- c->write_wait_head = op;
- c->write_wait_tail = op;
-
- if (!timer_pending(&c->foreground_write_wakeup))
- mod_timer(&c->foreground_write_wakeup,
- op->expires);
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock,
- flags);
- continue_at(cl, __bch_write, index_update_wq(op));
- }
-
- spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags);
- }
-
- continue_at_nobarrier(cl, __bch_write, NULL);
-}
-
-void bch_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_write_bio *bio, struct disk_reservation res,
- struct write_point *wp, struct bpos pos,
- u64 *journal_seq, unsigned flags)
-{
- EBUG_ON(res.sectors && !res.nr_replicas);
-
- op->c = c;
- op->io_wq = index_update_wq(op);
- op->bio = bio;
- op->written = 0;
- op->error = 0;
- op->flags = flags;
- op->csum_type = bch_data_checksum_type(c);
- op->compression_type = c->opts.compression;
- op->nr_replicas = res.nr_replicas;
- op->alloc_reserve = RESERVE_NONE;
- op->nonce = 0;
- op->pos = pos;
- op->version = ZERO_VERSION;
- op->res = res;
- op->wp = wp;
-
- if (journal_seq) {
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
- } else {
- op->journal_seq = 0;
- }
-
- op->index_update_fn = bch_write_index_default;
-
- bch_keylist_init(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys));
-
- if (version_stress_test(c))
- get_random_bytes(&op->version, sizeof(op->version));
-}
-
-/* Discard */
-
-/* bch_discard - discard a range of keys from start_key to end_key.
- * @c filesystem
- * @start	start position
- *	NOTE: discard starts at bkey_start_offset(start)
- * @end		end position
- *	NOTE: discard ends at end.offset
- * @version	version of discard (ZERO_VERSION if none)
- *
- * Returns:
- * 0 on success
- * <0 on error
- *
- * XXX: this needs to be refactored with inode_truncate, or more
- * appropriately inode_truncate should call this
- */
-int bch_discard(struct bch_fs *c, struct bpos start,
- struct bpos end, struct bversion version,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq)
-{
- return bch_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version,
- disk_res, hook, journal_seq);
-}
-
-/* Cache promotion on read */
-
-struct cache_promote_op {
- struct closure cl;
- struct migrate_write write;
- struct bio_vec bi_inline_vecs[0]; /* must be last */
-};
-
-/* Read */
-
-static int bio_checksum_uncompress(struct bch_fs *c,
- struct bch_read_bio *rbio)
-{
- struct bio *src = &rbio->bio;
- struct bio *dst = &bch_rbio_parent(rbio)->bio;
- struct bvec_iter dst_iter = rbio->parent_iter;
- struct nonce nonce = extent_nonce(rbio->version,
- rbio->crc.nonce,
- crc_uncompressed_size(NULL, &rbio->crc),
- rbio->crc.compression_type);
- struct bch_csum csum;
- int ret = 0;
-
- /*
- * reset iterator for checksumming and copying bounced data: here we've
- * set rbio->compressed_size to the amount of data we actually read,
- * which was not necessarily the full extent if we were only bouncing
- * in order to promote
- */
- if (rbio->bounce) {
- src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
- } else {
- src->bi_iter = rbio->parent_iter;
- }
-
- csum = bch_checksum_bio(c, rbio->crc.csum_type, nonce, src);
- if (bch_dev_nonfatal_io_err_on(bch_crc_cmp(rbio->crc.csum, csum), rbio->ca,
- "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
- rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
- rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo,
- rbio->crc.csum_type))
- ret = -EIO;
-
- /*
- * If there was a checksum error, still copy the data back - unless it
- * was compressed, we don't want to decompress bad data:
- */
- if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
- if (!ret) {
- bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
- ret = bch_bio_uncompress(c, src, dst,
- dst_iter, rbio->crc);
- if (ret)
- __bcache_io_error(c, "decompression error");
- }
- } else if (rbio->bounce) {
- bio_advance(src, rbio->crc.offset << 9);
-
- /* don't need to decrypt the entire bio: */
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
-
- nonce = nonce_add(nonce, rbio->crc.offset << 9);
-
- bch_encrypt_bio(c, rbio->crc.csum_type,
- nonce, src);
-
- bio_copy_data_iter(dst, dst_iter,
- src, src->bi_iter);
- } else {
- bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
- }
-
- return ret;
-}
-
-static void bch_rbio_free(struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- struct bio *bio = &rbio->bio;
-
- BUG_ON(rbio->ca);
- BUG_ON(!rbio->split);
-
- if (rbio->promote)
- kfree(rbio->promote);
- if (rbio->bounce)
- bch_bio_free_pages_pool(c, bio);
-
- bio_put(bio);
-}
-
-static void bch_rbio_done(struct bch_read_bio *rbio)
-{
- struct bio *orig = &bch_rbio_parent(rbio)->bio;
-
- percpu_ref_put(&rbio->ca->io_ref);
- rbio->ca = NULL;
-
- if (rbio->split) {
- if (rbio->bio.bi_error)
- orig->bi_error = rbio->bio.bi_error;
-
- bio_endio(orig);
- bch_rbio_free(rbio);
- } else {
- if (rbio->promote)
- kfree(rbio->promote);
-
- orig->bi_end_io = rbio->orig_bi_end_io;
- bio_endio_nodec(orig);
- }
-}
-
-static void bch_rbio_error(struct bch_read_bio *rbio, int error)
-{
- bch_rbio_parent(rbio)->bio.bi_error = error;
- bch_rbio_done(rbio);
-}
-
-static void bch_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
- unsigned long flags;
-
- percpu_ref_put(&rbio->ca->io_ref);
- rbio->ca = NULL;
-
- spin_lock_irqsave(&c->read_retry_lock, flags);
- bio_list_add(&c->read_retry_list, &rbio->bio);
- spin_unlock_irqrestore(&c->read_retry_lock, flags);
- queue_work(c->wq, &c->read_retry_work);
-}
-
-static void cache_promote_done(struct closure *cl)
-{
- struct cache_promote_op *op =
- container_of(cl, struct cache_promote_op, cl);
-
- bch_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
- kfree(op);
-}
-
-/* Inner part that may run in process context */
-static void __bch_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- int ret;
-
- ret = bio_checksum_uncompress(c, rbio);
- if (ret) {
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_FORCE_BOUNCE;
- bch_rbio_retry(c, rbio);
- } else {
- bch_rbio_error(rbio, -EIO);
- }
- return;
- }
-
- if (rbio->promote) {
- struct cache_promote_op *promote = rbio->promote;
- struct closure *cl = &promote->cl;
-
- BUG_ON(!rbio->split || !rbio->bounce);
-
- /* we now own pages: */
- swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
- rbio->promote = NULL;
-
- bch_rbio_done(rbio);
-
- closure_init(cl, &c->cl);
- closure_call(&promote->write.op.cl, bch_write, c->wq, cl);
- closure_return_with_destructor(cl, cache_promote_done);
- } else {
- bch_rbio_done(rbio);
- }
-}
-
-static void bch_read_endio(struct bio *bio)
-{
- struct bch_read_bio *rbio =
- container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
-
- if (rbio->flags & BCH_READ_ACCOUNT_TIMES)
- bch_account_io_completion_time(rbio->ca, rbio->submit_time_us,
- REQ_OP_READ);
-
- if (bch_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) {
- /* XXX: retry IO errors when we have another replica */
- bch_rbio_error(rbio, bio->bi_error);
- return;
- }
-
- if (rbio->ptr.cached &&
- (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(rbio->ca, &rbio->ptr))) {
- atomic_long_inc(&c->cache_read_races);
-
- if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch_rbio_retry(c, rbio);
- else
- bch_rbio_error(rbio, -EINTR);
- return;
- }
-
- if (rbio->crc.compression_type ||
- bch_csum_type_is_encryption(rbio->crc.csum_type))
- queue_work(system_unbound_wq, &rbio->work);
- else if (rbio->crc.csum_type)
- queue_work(system_highpri_wq, &rbio->work);
- else
- __bch_read_endio(&rbio->work);
-}
-
-static bool should_promote(struct bch_fs *c,
- struct extent_pick_ptr *pick, unsigned flags)
-{
- if (!(flags & BCH_READ_PROMOTE))
- return false;
-
- if (percpu_ref_is_dying(&c->writes))
- return false;
-
- return c->fastest_tier &&
- c->fastest_tier < c->tiers + pick->ca->mi.tier;
-}
-
-void bch_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c k,
- struct extent_pick_ptr *pick, unsigned flags)
-{
- struct bch_read_bio *rbio;
- struct cache_promote_op *promote_op = NULL;
- unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
- bool bounce = false, split, read_full = false;
-
- EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
- k.k->p.offset < bvec_iter_end_sector(iter));
-
- /* only promote if we're not reading from the fastest tier: */
-
- /*
- * XXX: multiple promotes can race with each other, wastefully. Keep a
- * list of outstanding promotes?
- */
- if (should_promote(c, pick, flags)) {
- /*
- * biovec needs to be big enough to hold decompressed data, if
- * the bch_write_extent() has to decompress/recompress it:
- */
- unsigned sectors =
- max_t(unsigned, k.k->size,
- crc_uncompressed_size(NULL, &pick->crc));
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-
- promote_op = kmalloc(sizeof(*promote_op) +
- sizeof(struct bio_vec) * pages, GFP_NOIO);
- if (promote_op) {
- struct bio *promote_bio = &promote_op->write.wbio.bio;
-
- bio_init(promote_bio);
- promote_bio->bi_max_vecs = pages;
- promote_bio->bi_io_vec = promote_bio->bi_inline_vecs;
- bounce = true;
- /* could also set read_full */
- }
- }
-
- /*
- * note: if compression_type and crc_type both == none, then
- * compressed/uncompressed size is zero
- */
- if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
- (pick->crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
- (bch_csum_type_is_encryption(pick->crc.csum_type) &&
- (flags & BCH_READ_USER_MAPPED)) ||
- (flags & BCH_READ_FORCE_BOUNCE)))) {
- read_full = true;
- bounce = true;
- }
-
- if (bounce) {
- unsigned sectors = read_full
- ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
- : bvec_iter_sectors(iter);
-
- rbio = container_of(bio_alloc_bioset(GFP_NOIO,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split),
- struct bch_read_bio, bio);
-
- bch_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- split = true;
- } else if (!(flags & BCH_READ_MAY_REUSE_BIO) ||
- !(flags & BCH_READ_IS_LAST)) {
- /*
- * Have to clone if there were any splits, due to error
- * reporting issues (if a split errored, and retrying didn't
- * work, when it reports the error to its parent (us) we don't
- * know if the error was from our bio, and we should retry, or
- * from the whole bio, in which case we don't want to retry and
- * lose the error)
- */
- rbio = container_of(bio_clone_fast(&orig->bio,
- GFP_NOIO, &c->bio_read_split),
- struct bch_read_bio, bio);
- rbio->bio.bi_iter = iter;
- split = true;
- } else {
- rbio = orig;
- rbio->bio.bi_iter = iter;
- split = false;
- BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
- }
-
- if (!(flags & BCH_READ_IS_LAST))
- __bio_inc_remaining(&orig->bio);
-
- if (split)
- rbio->parent = orig;
- else
- rbio->orig_bi_end_io = orig->bio.bi_end_io;
- rbio->parent_iter = iter;
-
- rbio->flags = flags;
- rbio->bounce = bounce;
- rbio->split = split;
- rbio->c = c;
- rbio->ca = pick->ca;
- rbio->ptr = pick->ptr;
- rbio->crc = pick->crc;
- /*
- * crc.compressed_size will be 0 if there wasn't any checksum
-	 * information; we also need to stash the original size of the bio if we
- * bounced (which isn't necessarily the original key size, if we bounced
- * only for promoting)
- */
- rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1;
- rbio->version = k.k->version;
- rbio->promote = promote_op;
- rbio->inode = k.k->p.inode;
- INIT_WORK(&rbio->work, __bch_read_endio);
-
- rbio->bio.bi_bdev = pick->ca->disk_sb.bdev;
- rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
- rbio->bio.bi_end_io = bch_read_endio;
-
- if (promote_op) {
- struct bio *promote_bio = &promote_op->write.wbio.bio;
-
- promote_bio->bi_iter = rbio->bio.bi_iter;
- memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-
- bch_migrate_write_init(c, &promote_op->write,
- &c->promote_write_point,
- k, NULL,
- BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_CACHED);
- promote_op->write.promote = true;
-
- if (rbio->crc.compression_type) {
- promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
- promote_op->write.op.crc = rbio->crc;
- promote_op->write.op.size = k.k->size;
- } else if (read_full) {
- /*
- * Adjust bio to correspond to _live_ portion of @k -
- * which might be less than what we're actually reading:
- */
- bio_advance(promote_bio, rbio->crc.offset << 9);
- BUG_ON(bio_sectors(promote_bio) < k.k->size);
- promote_bio->bi_iter.bi_size = k.k->size << 9;
- } else {
- /*
- * Set insert pos to correspond to what we're actually
- * reading:
- */
- promote_op->write.op.pos.offset = iter.bi_sector;
- }
-
- promote_bio->bi_iter.bi_sector =
- promote_op->write.op.pos.offset;
- }
-
-	/* _after_ promote stuff has looked at rbio->crc.offset */
- if (read_full)
- rbio->crc.offset += skip;
- else
- rbio->bio.bi_iter.bi_sector += skip;
-
- rbio->submit_time_us = local_clock_us();
-
-#ifndef CONFIG_BCACHE_NO_IO
- generic_make_request(&rbio->bio);
-#else
- bio_endio(&rbio->bio);
-#endif
-}
-
-static void bch_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- unsigned flags)
-{
- struct bio *bio = &rbio->bio;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector), k) {
- BKEY_PADDED(k) tmp;
- struct extent_pick_ptr pick;
- unsigned bytes, sectors;
- bool is_last;
-
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
- bch_btree_iter_unlock(&iter);
-
- bch_extent_pick_ptr(c, k, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, bio, "no device to read from");
- bio_endio(bio);
- return;
- }
-
- sectors = min_t(u64, k.k->p.offset,
- bvec_iter_end_sector(bvec_iter)) -
- bvec_iter.bi_sector;
- bytes = sectors << 9;
- is_last = bytes == bvec_iter.bi_size;
- swap(bvec_iter.bi_size, bytes);
-
- if (is_last)
- flags |= BCH_READ_IS_LAST;
-
- if (pick.ca) {
- PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
- c->prio_clock[READ].hand;
-
- bch_read_extent_iter(c, rbio, bvec_iter,
- k, &pick, flags);
-
- flags &= ~BCH_READ_MAY_REUSE_BIO;
- } else {
- zero_fill_bio_iter(bio, bvec_iter);
-
- if (is_last)
- bio_endio(bio);
- }
-
- if (is_last)
- return;
-
- swap(bvec_iter.bi_size, bytes);
- bio_advance_iter(bio, &bvec_iter, bytes);
- }
-
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- ret = bch_btree_iter_unlock(&iter);
- BUG_ON(!ret);
- bcache_io_error(c, bio, "btree IO error %i", ret);
- bio_endio(bio);
-}
-
-void bch_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode)
-{
- bch_increment_clock(c, bio_sectors(&bio->bio), READ);
-
- bch_read_iter(c, bio, bio->bio.bi_iter, inode,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_PROMOTE|
- BCH_READ_MAY_REUSE_BIO|
- BCH_READ_USER_MAPPED);
-}
-
-/**
- * bch_read_retry - re-submit a bio originally from bch_read()
- */
-static void bch_read_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
- struct bch_read_bio *parent = bch_rbio_parent(rbio);
- struct bvec_iter iter = rbio->parent_iter;
- unsigned flags = rbio->flags;
- u64 inode = rbio->inode;
-
- trace_bcache_read_retry(&rbio->bio);
-
- if (rbio->split)
- bch_rbio_free(rbio);
- else
- rbio->bio.bi_end_io = rbio->orig_bi_end_io;
-
- bch_read_iter(c, parent, iter, inode, flags);
-}
-
-void bch_read_retry_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- read_retry_work);
- struct bch_read_bio *rbio;
- struct bio *bio;
- unsigned long flags;
-
- while (1) {
- spin_lock_irqsave(&c->read_retry_lock, flags);
- bio = bio_list_pop(&c->read_retry_list);
- spin_unlock_irqrestore(&c->read_retry_lock, flags);
-
- if (!bio)
- break;
-
- rbio = container_of(bio, struct bch_read_bio, bio);
- bch_read_retry(c, rbio);
- }
-}
diff --git a/libbcache/io.h b/libbcache/io.h
deleted file mode 100644
index 9239ca4a..00000000
--- a/libbcache/io.h
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef _BCACHE_IO_H
-#define _BCACHE_IO_H
-
-#include "io_types.h"
-
-#define to_wbio(_bio) \
- container_of((_bio), struct bch_write_bio, bio)
-
-#define to_rbio(_bio) \
- container_of((_bio), struct bch_read_bio, bio)
-
-void bch_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-enum bch_write_flags {
- BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
- BCH_WRITE_DISCARD = (1 << 1),
- BCH_WRITE_CACHED = (1 << 2),
- BCH_WRITE_FLUSH = (1 << 3),
- BCH_WRITE_DISCARD_ON_ERROR = (1 << 4),
- BCH_WRITE_DATA_COMPRESSED = (1 << 5),
-
- /* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
- BCH_WRITE_DONE = (1 << 7),
- BCH_WRITE_LOOPED = (1 << 8),
-};
-
-static inline u64 *op_journal_seq(struct bch_write_op *op)
-{
- return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
- ? op->journal_seq_p : &op->journal_seq;
-}
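
/*
 * Illustrative, standalone sketch (not from the removed header): the helper
 * above picks between a caller-provided sequence pointer and inline storage,
 * using a flag to record which union member is live.  Hypothetical reduced
 * form:
 */
#include <stdint.h>

struct seq_slot {
	unsigned	has_ptr:1;
	union {
		uint64_t	*p;
		uint64_t	inline_seq;
	};
};

static uint64_t *seq_ptr(struct seq_slot *s)
{
	return s->has_ptr ? s->p : &s->inline_seq;
}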
-
-static inline struct write_point *foreground_write_point(struct bch_fs *c,
- unsigned long v)
-{
- return c->write_points +
- hash_long(v, ilog2(ARRAY_SIZE(c->write_points)));
-}
-
-void bch_write_op_init(struct bch_write_op *, struct bch_fs *,
- struct bch_write_bio *,
- struct disk_reservation, struct write_point *,
- struct bpos, u64 *, unsigned);
-void bch_write(struct closure *);
-
-struct cache_promote_op;
-
-struct extent_pick_ptr;
-
-void bch_read_extent_iter(struct bch_fs *, struct bch_read_bio *,
- struct bvec_iter, struct bkey_s_c k,
- struct extent_pick_ptr *, unsigned);
-
-static inline void bch_read_extent(struct bch_fs *c,
- struct bch_read_bio *orig,
- struct bkey_s_c k,
- struct extent_pick_ptr *pick,
- unsigned flags)
-{
- bch_read_extent_iter(c, orig, orig->bio.bi_iter,
- k, pick, flags);
-}
-
-enum bch_read_flags {
- BCH_READ_FORCE_BOUNCE = 1 << 0,
- BCH_READ_RETRY_IF_STALE = 1 << 1,
- BCH_READ_PROMOTE = 1 << 2,
- BCH_READ_IS_LAST = 1 << 3,
- BCH_READ_MAY_REUSE_BIO = 1 << 4,
- BCH_READ_ACCOUNT_TIMES = 1 << 5,
- BCH_READ_USER_MAPPED = 1 << 6,
-};
-
-void bch_read(struct bch_fs *, struct bch_read_bio *, u64);
-
-void bch_generic_make_request(struct bio *, struct bch_fs *);
-void bch_bio_submit_work(struct work_struct *);
-void bch_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
- const struct bkey_i *, bool);
-
-int bch_discard(struct bch_fs *, struct bpos, struct bpos,
- struct bversion, struct disk_reservation *,
- struct extent_insert_hook *, u64 *);
-
-void bch_read_retry_work(struct work_struct *);
-void bch_wake_delayed_writes(unsigned long data);
-
-#endif /* _BCACHE_IO_H */
diff --git a/libbcache/io_types.h b/libbcache/io_types.h
deleted file mode 100644
index ca1b0192..00000000
--- a/libbcache/io_types.h
+++ /dev/null
@@ -1,145 +0,0 @@
-#ifndef _BCACHE_IO_TYPES_H
-#define _BCACHE_IO_TYPES_H
-
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "keylist_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-struct bch_read_bio {
- /*
- * Reads will often have to be split, and if the extent being read from
- * was checksummed or compressed we'll also have to allocate bounce
- * buffers and copy the data back into the original bio.
- *
- * If we didn't have to split, we have to save and restore the original
- * bi_end_io - @split below indicates which:
- */
- union {
- struct bch_read_bio *parent;
- bio_end_io_t *orig_bi_end_io;
- };
-
- /*
- * Saved copy of parent->bi_iter, from submission time - allows us to
- * resubmit on IO error, and also to copy data back to the original bio
- * when we're bouncing:
- */
- struct bvec_iter parent_iter;
-
- unsigned submit_time_us;
- u16 flags;
- u8 bounce:1,
- split:1;
-
- struct bch_fs *c;
- struct bch_dev *ca;
- struct bch_extent_ptr ptr;
- struct bch_extent_crc128 crc;
- struct bversion version;
-
- struct cache_promote_op *promote;
-
- /*
- * If we have to retry the read (IO error, checksum failure, read stale
-	 * data (raced with allocator)), we retry the portion of the parent bio
- * that failed (i.e. this bio's portion, parent_iter).
- *
- * But we need to stash the inode somewhere:
- */
- u64 inode;
-
- struct work_struct work;
-
- struct bio bio;
-};
-
-static inline struct bch_read_bio *
-bch_rbio_parent(struct bch_read_bio *rbio)
-{
- return rbio->split ? rbio->parent : rbio;
-}
-
-struct bch_write_bio {
- struct bch_fs *c;
- struct bch_dev *ca;
- union {
- struct bio *orig;
- struct closure *cl;
- };
-
- unsigned submit_time_us;
- unsigned split:1,
- bounce:1,
- put_bio:1;
-
- /* Only for btree writes: */
- unsigned used_mempool:1;
- u8 order;
-
- struct bio bio;
-};
-
-struct bch_replace_info {
- struct extent_insert_hook hook;
- /* How many insertions succeeded */
- unsigned successes;
- /* How many insertions failed */
- unsigned failures;
- BKEY_PADDED(key);
-};
-
-struct bch_write_op {
- struct closure cl;
- struct bch_fs *c;
- struct workqueue_struct *io_wq;
- struct bch_write_bio *bio;
-
- unsigned written; /* sectors */
-
- short error;
-
- u16 flags;
- unsigned csum_type:4;
- unsigned compression_type:4;
- unsigned nr_replicas:4;
- unsigned alloc_reserve:4;
- unsigned nonce:14;
-
- struct bpos pos;
- struct bversion version;
-
- /* For BCH_WRITE_DATA_COMPRESSED: */
- struct bch_extent_crc128 crc;
- unsigned size;
-
- struct disk_reservation res;
-
- struct write_point *wp;
-
- union {
- u8 open_buckets[16];
- struct {
- struct bch_write_op *next;
- unsigned long expires;
- };
- };
-
- /*
- * If caller wants to flush but hasn't passed us a journal_seq ptr, we
- * still need to stash the journal_seq somewhere:
- */
- union {
- u64 *journal_seq_p;
- u64 journal_seq;
- };
-
- int (*index_update_fn)(struct bch_write_op *);
-
- struct keylist insert_keys;
- u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-};
-
-#endif /* _BCACHE_IO_TYPES_H */
diff --git a/libbcache/journal.c b/libbcache/journal.c
deleted file mode 100644
index 585d1205..00000000
--- a/libbcache/journal.c
+++ /dev/null
@@ -1,2835 +0,0 @@
-/*
- * bcache journalling code, for btree insertions
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "alloc.h"
-#include "bkey_methods.h"
-#include "buckets.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_io.h"
-#include "checksum.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "keylist.h"
-#include "journal.h"
-#include "super-io.h"
-#include "vstructs.h"
-
-#include <trace/events/bcache.h>
-
-static void journal_write(struct closure *);
-static void journal_reclaim_fast(struct journal *);
-static void journal_pin_add_entry(struct journal *,
- struct journal_entry_pin_list *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- return j->buf + j->reservations.idx;
-}
-
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
- return j->buf + !j->reservations.idx;
-}
-
-/* Sequence number of oldest dirty journal entry */
-
-static inline u64 last_seq(struct journal *j)
-{
- return atomic64_read(&j->seq) - fifo_used(&j->pin) + 1;
-}
-
-static inline u64 journal_pin_seq(struct journal *j,
- struct journal_entry_pin_list *pin_list)
-{
- return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
-}
-
-static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
- struct jset_entry *entry, unsigned type)
-{
- while (entry < vstruct_last(jset)) {
- if (JOURNAL_ENTRY_TYPE(entry) == type)
- return entry;
-
- entry = vstruct_next(entry);
- }
-
- return NULL;
-}
-
-#define for_each_jset_entry_type(entry, jset, type) \
- for (entry = (jset)->start; \
- (entry = __jset_entry_type_next(jset, entry, type)); \
- entry = vstruct_next(entry))
-
-#define for_each_jset_key(k, _n, entry, jset) \
- for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
- vstruct_for_each_safe(entry, k, _n)
-
-static inline void bch_journal_add_entry(struct journal_buf *buf,
- const void *data, size_t u64s,
- unsigned type, enum btree_id id,
- unsigned level)
-{
- struct jset *jset = buf->data;
-
- bch_journal_add_entry_at(buf, data, u64s, type, id, level,
- le32_to_cpu(jset->u64s));
- le32_add_cpu(&jset->u64s, jset_u64s(u64s));
-}
-
-static struct jset_entry *bch_journal_find_entry(struct jset *j, unsigned type,
- enum btree_id id)
-{
- struct jset_entry *entry;
-
- for_each_jset_entry_type(entry, j, type)
- if (entry->btree_id == id)
- return entry;
-
- return NULL;
-}
-
-struct bkey_i *bch_journal_find_btree_root(struct bch_fs *c, struct jset *j,
- enum btree_id id, unsigned *level)
-{
- struct bkey_i *k;
- struct jset_entry *entry =
- bch_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id);
-
- if (!entry)
- return NULL;
-
- k = entry->start;
-	*level = entry->level;
- return k;
-}
-
-static void bch_journal_add_btree_root(struct journal_buf *buf,
- enum btree_id id, struct bkey_i *k,
- unsigned level)
-{
- bch_journal_add_entry(buf, k, k->k.u64s,
- JOURNAL_ENTRY_BTREE_ROOT, id, level);
-}
-
-static inline void bch_journal_add_prios(struct journal *j,
- struct journal_buf *buf)
-{
- /*
- * no prio bucket ptrs yet... XXX should change the allocator so this
- * can't happen:
- */
- if (!buf->nr_prio_buckets)
- return;
-
- bch_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets,
- JOURNAL_ENTRY_PRIO_PTRS, 0, 0);
-}
-
-static void journal_seq_blacklist_flush(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct bch_fs *c =
- container_of(j, struct bch_fs, journal);
- struct journal_seq_blacklist *bl =
- container_of(pin, struct journal_seq_blacklist, pin);
- struct blacklisted_node n;
- struct closure cl;
- unsigned i;
- int ret;
-
- closure_init_stack(&cl);
-
- for (i = 0;; i++) {
- struct btree_iter iter;
- struct btree *b;
-
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
- break;
- }
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-
- bch_btree_iter_init(&iter, c, n.btree_id, n.pos);
- iter.is_extents = false;
-redo_peek:
- b = bch_btree_iter_peek_node(&iter);
-
- /* The node might have already been rewritten: */
-
- if (b->data->keys.seq == n.seq &&
- !bkey_cmp(b->key.k.p, n.pos)) {
- ret = bch_btree_node_rewrite(&iter, b, &cl);
- if (ret) {
- bch_btree_iter_unlock(&iter);
- closure_sync(&cl);
-
- if (ret == -EAGAIN ||
- ret == -EINTR)
- goto redo_peek;
-
- /* -EROFS or perhaps -ENOSPC - bail out: */
- /* XXX warn here */
- return;
- }
- }
-
- bch_btree_iter_unlock(&iter);
- }
-
- closure_sync(&cl);
-
- for (i = 0;; i++) {
- struct btree_interior_update *as;
- struct pending_btree_node_free *d;
-
- mutex_lock(&j->blacklist_lock);
- if (i >= bl->nr_entries) {
- mutex_unlock(&j->blacklist_lock);
- break;
- }
- n = bl->entries[i];
- mutex_unlock(&j->blacklist_lock);
-redo_wait:
- mutex_lock(&c->btree_interior_update_lock);
-
- /*
- * Is the node on the list of pending interior node updates -
- * being freed? If so, wait for that to finish:
- */
- for_each_pending_btree_node_free(c, as, d)
- if (n.seq == d->seq &&
- n.btree_id == d->btree_id &&
- !d->level &&
- !bkey_cmp(n.pos, d->key.k.p)) {
- closure_wait(&as->wait, &cl);
- mutex_unlock(&c->btree_interior_update_lock);
- closure_sync(&cl);
- goto redo_wait;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
- }
-
- mutex_lock(&j->blacklist_lock);
-
- bch_journal_pin_drop(j, &bl->pin);
- list_del(&bl->list);
- kfree(bl->entries);
- kfree(bl);
-
- mutex_unlock(&j->blacklist_lock);
-}
-
-static struct journal_seq_blacklist *
-journal_seq_blacklist_find(struct journal *j, u64 seq)
-{
- struct journal_seq_blacklist *bl;
-
- lockdep_assert_held(&j->blacklist_lock);
-
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (seq == bl->seq)
- return bl;
-
- return NULL;
-}
-
-static struct journal_seq_blacklist *
-bch_journal_seq_blacklisted_new(struct journal *j, u64 seq)
-{
- struct journal_seq_blacklist *bl;
-
- lockdep_assert_held(&j->blacklist_lock);
-
- bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return NULL;
-
- bl->seq = seq;
- list_add_tail(&bl->list, &j->seq_blacklist);
- return bl;
-}
-
-/*
- * Returns true if @seq is newer than the most recent journal entry that got
- * written, and data corresponding to @seq should be ignored - also marks @seq
- * as blacklisted so that on future restarts the corresponding data will still
- * be ignored:
- */
-int bch_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
-{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl = NULL;
- struct blacklisted_node *n;
- u64 journal_seq, i;
- int ret = 0;
-
- if (!seq)
- return 0;
-
- journal_seq = atomic64_read(&j->seq);
-
-	/* Interior updates aren't journalled: */
- BUG_ON(b->level);
- BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
-
- if (seq <= journal_seq) {
- if (list_empty_careful(&j->seq_blacklist))
- return 0;
-
- mutex_lock(&j->blacklist_lock);
- ret = journal_seq_blacklist_find(j, seq) != NULL;
- mutex_unlock(&j->blacklist_lock);
- return ret;
- }
-
- /*
- * Decrease this back to j->seq + 2 when we next rev the on disk format:
-	 * increasing it temporarily to work around a bug in old kernels
- */
- bch_fs_inconsistent_on(seq > journal_seq + 4, c,
- "bset journal seq too far in the future: %llu > %llu",
- seq, journal_seq);
-
- bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
- b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
-
- /*
- * When we start the journal, bch_journal_start() will skip over @seq:
- */
-
- mutex_lock(&j->blacklist_lock);
-
- for (i = journal_seq + 1; i <= seq; i++) {
- bl = journal_seq_blacklist_find(j, i) ?:
- bch_journal_seq_blacklisted_new(j, i);
-
- if (!bl) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
- if (b->data->keys.seq == n->seq &&
- b->btree_id == n->btree_id &&
- !bkey_cmp(b->key.k.p, n->pos))
- goto found_entry;
-
- if (!bl->nr_entries ||
- is_power_of_2(bl->nr_entries)) {
- n = krealloc(bl->entries,
- max(bl->nr_entries * 2, 8UL) * sizeof(*n),
- GFP_KERNEL);
- if (!n) {
- ret = -ENOMEM;
- goto out;
- }
- bl->entries = n;
- }
-
- bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
- .seq = b->data->keys.seq,
- .btree_id = b->btree_id,
- .pos = b->key.k.p,
- };
-found_entry:
- ret = 1;
-out:
- mutex_unlock(&j->blacklist_lock);
- return ret;
-}
-
-/*
- * Journal replay/recovery:
- *
- * This code is all driven from bch_fs_start(); we first read the journal
- * entries, do some other stuff, then we mark all the keys in the journal
- * entries (same as garbage collection would), then we replay them - reinserting
- * them into the cache in precisely the same order as they appear in the
- * journal.
- *
- * We only journal keys that go in leaf nodes, which simplifies things quite a
- * bit.
- */
-
-struct journal_list {
- struct closure cl;
- struct mutex lock;
- struct list_head *head;
- int ret;
-};
-
-#define JOURNAL_ENTRY_ADD_OK 0
-#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
-
-/*
- * Given a journal entry we just read, add it to the list of journal entries to
- * be replayed:
- */
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
- struct jset *j)
-{
- struct journal_replay *i, *pos;
- struct list_head *where;
- size_t bytes = vstruct_bytes(j);
- __le64 last_seq;
- int ret;
-
- mutex_lock(&jlist->lock);
-
- last_seq = !list_empty(jlist->head)
- ? list_last_entry(jlist->head, struct journal_replay,
- list)->j.last_seq
- : 0;
-
- /* Is this entry older than the range we need? */
- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
- }
-
- /* Drop entries we don't need anymore */
- list_for_each_entry_safe(i, pos, jlist->head, list) {
- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
- break;
- list_del(&i->list);
- kfree(i);
- }
-
- list_for_each_entry_reverse(i, jlist->head, list) {
- /* Duplicate? */
- if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
- fsck_err_on(bytes != vstruct_bytes(&i->j) ||
- memcmp(j, &i->j, bytes), c,
-				    "found duplicate but non-identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
-
- ret = JOURNAL_ENTRY_ADD_OK;
- goto out;
- }
-
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
- }
-
- where = jlist->head;
-add:
- i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i) {
- ret = -ENOMEM;
- goto out;
- }
-
- memcpy(&i->j, j, bytes);
- list_add(&i->list, where);
- ret = JOURNAL_ENTRY_ADD_OK;
-out:
-fsck_err:
- mutex_unlock(&jlist->lock);
- return ret;
-}
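
/*
 * Illustrative, standalone sketch (not from the removed file): the core of
 * journal_entry_add() above is "walk the replay list backwards and splice the
 * new entry in sequence order, ignoring exact duplicates".  Reduced to a
 * sorted array of sequence numbers; the caller must ensure the array has room
 * for one more element.
 */
#include <stdint.h>
#include <string.h>

static size_t replay_list_add(uint64_t *seqs, size_t n, uint64_t seq)
{
	size_t i = n;

	/* find the insertion point, scanning from the newest entry back */
	while (i && seqs[i - 1] > seq)
		i--;

	if (i && seqs[i - 1] == seq)	/* duplicate: keep the existing entry */
		return n;

	memmove(&seqs[i + 1], &seqs[i], (n - i) * sizeof(*seqs));
	seqs[i] = seq;
	return n + 1;
}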
-
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
-static void journal_entry_null_range(void *start, void *end)
-{
- struct jset_entry *entry;
-
- for (entry = start; entry != end; entry = vstruct_next(entry)) {
- entry->u64s = 0;
- entry->btree_id = 0;
- entry->level = 0;
- entry->flags = 0;
- SET_JOURNAL_ENTRY_TYPE(entry, 0);
- }
-}
-
-static int journal_validate_key(struct bch_fs *c, struct jset *j,
- struct jset_entry *entry,
- struct bkey_i *k, enum bkey_type key_type,
- const char *type)
-{
- void *next = vstruct_next(entry);
- const char *invalid;
- char buf[160];
- int ret = 0;
-
- if (mustfix_fsck_err_on(!k->k.u64s, c,
- "invalid %s in journal: k->u64s 0", type)) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- if (mustfix_fsck_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry), c,
- "invalid %s in journal: extends past end of journal entry",
- type)) {
- entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- if (mustfix_fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in journal: bad format %u",
- type, k->k.format)) {
- le16_add_cpu(&entry->u64s, -k->k.u64s);
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-
- if (JSET_BIG_ENDIAN(j) != CPU_BIG_ENDIAN)
- bch_bkey_swab(key_type, NULL, bkey_to_packed(k));
-
- invalid = bkey_invalid(c, key_type, bkey_i_to_s_c(k));
- if (invalid) {
- bch_bkey_val_to_text(c, key_type, buf, sizeof(buf),
- bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf);
-
- le16_add_cpu(&entry->u64s, -k->k.u64s);
- memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(vstruct_next(entry), next);
- return 0;
- }
-fsck_err:
- return ret;
-}
-
-#define JOURNAL_ENTRY_REREAD 5
-#define JOURNAL_ENTRY_NONE 6
-#define JOURNAL_ENTRY_BAD 7
-
-static int journal_entry_validate(struct bch_fs *c,
- struct jset *j, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read)
-{
- struct jset_entry *entry;
- size_t bytes = vstruct_bytes(j);
- struct bch_csum csum;
- int ret = 0;
-
- if (le64_to_cpu(j->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
- bch_err(c, "unknown journal entry version %u",
- le32_to_cpu(j->version));
- return BCH_FSCK_UNKNOWN_VERSION;
- }
-
- if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
-			"journal entry too big (%zu bytes), sector %llu",
- bytes, sector)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
-
- if (bytes > sectors_read << 9)
- return JOURNAL_ENTRY_REREAD;
-
- if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
-		    "journal entry with unknown csum type %llu sector %llu",
- JSET_CSUM_TYPE(j), sector))
- return JOURNAL_ENTRY_BAD;
-
- csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
- if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c,
- "journal checksum bad, sector %llu", sector)) {
- /* XXX: retry IO, when we start retrying checksum errors */
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
-
- bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
- j->encrypted_start,
- vstruct_end(j) - (void *) j->encrypted_start);
-
- if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
- "invalid journal entry: last_seq > seq"))
- j->last_seq = j->seq;
-
- vstruct_for_each(j, entry) {
- struct bkey_i *k;
-
- if (mustfix_fsck_err_on(vstruct_next(entry) >
- vstruct_last(j), c,
-				"journal entry extends past end of jset")) {
-			j->u64s = cpu_to_le32((u64 *) entry - j->_data);
- break;
- }
-
- switch (JOURNAL_ENTRY_TYPE(entry)) {
- case JOURNAL_ENTRY_BTREE_KEYS:
- vstruct_for_each(entry, k) {
- ret = journal_validate_key(c, j, entry, k,
- bkey_type(entry->level,
- entry->btree_id),
- "key");
- if (ret)
- goto fsck_err;
- }
- break;
-
- case JOURNAL_ENTRY_BTREE_ROOT:
- k = entry->start;
-
- if (mustfix_fsck_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s, c,
- "invalid btree root journal entry: wrong number of keys")) {
- journal_entry_null_range(entry,
- vstruct_next(entry));
- continue;
- }
-
- ret = journal_validate_key(c, j, entry, k,
- BKEY_TYPE_BTREE, "btree root");
- if (ret)
- goto fsck_err;
- break;
-
- case JOURNAL_ENTRY_PRIO_PTRS:
- break;
-
- case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED:
- if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
- "invalid journal seq blacklist entry: bad size")) {
- journal_entry_null_range(entry,
- vstruct_next(entry));
- }
-
- break;
- default:
- mustfix_fsck_err(c, "invalid journal entry type %llu",
- JOURNAL_ENTRY_TYPE(entry));
- journal_entry_null_range(entry, vstruct_next(entry));
- break;
- }
- }
-
-fsck_err:
- return ret;
-}
-
-struct journal_read_buf {
- void *data;
- size_t size;
-};
-
-static int journal_read_buf_realloc(struct journal_read_buf *b,
- size_t new_size)
-{
- void *n;
-
- new_size = roundup_pow_of_two(new_size);
- n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
- if (!n)
- return -ENOMEM;
-
- free_pages((unsigned long) b->data, get_order(b->size));
- b->data = n;
- b->size = new_size;
- return 0;
-}
-
-static int journal_read_bucket(struct bch_dev *ca,
- struct journal_read_buf *buf,
- struct journal_list *jlist,
- unsigned bucket, u64 *seq, bool *entries_found)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct bio *bio = ja->bio;
- struct jset *j = NULL;
- unsigned sectors, sectors_read = 0;
- u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
- end = offset + ca->mi.bucket_size;
- bool saw_bad = false;
- int ret = 0;
-
- pr_debug("reading %u", bucket);
-
- while (offset < end) {
- if (!sectors_read) {
-reread: sectors_read = min_t(unsigned,
- end - offset, buf->size >> 9);
-
- bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_iter.bi_sector = offset;
- bio->bi_iter.bi_size = sectors_read << 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch_bio_map(bio, buf->data);
-
- ret = submit_bio_wait(bio);
-
- if (bch_dev_fatal_io_err_on(ret, ca,
- "journal read from sector %llu",
- offset) ||
- bch_meta_read_fault("journal"))
- return -EIO;
-
- j = buf->data;
- }
-
- ret = journal_entry_validate(c, j, offset,
- end - offset, sectors_read);
- switch (ret) {
- case BCH_FSCK_OK:
- break;
- case JOURNAL_ENTRY_REREAD:
- if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(buf,
- vstruct_bytes(j));
- if (ret)
- return ret;
- }
- goto reread;
- case JOURNAL_ENTRY_NONE:
- if (!saw_bad)
- return 0;
- sectors = c->sb.block_size;
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
- sectors = c->sb.block_size;
- goto next_block;
- default:
- return ret;
- }
-
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- ret = journal_entry_add(c, jlist, j);
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- *entries_found = true;
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- return ret;
- }
-
- if (le64_to_cpu(j->seq) > *seq)
- *seq = le64_to_cpu(j->seq);
-
- sectors = vstruct_sectors(j, c->block_bits);
-next_block:
- pr_debug("next");
- offset += sectors;
- sectors_read -= sectors;
- j = ((void *) j) + (sectors << 9);
- }
-
- return 0;
-}
-
-static void bch_journal_read_device(struct closure *cl)
-{
-#define read_bucket(b) \
- ({ \
- bool entries_found = false; \
- ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
- &entries_found); \
- if (ret) \
- goto err; \
- __set_bit(b, bitmap); \
- entries_found; \
- })
-
- struct journal_device *ja =
- container_of(cl, struct journal_device, read);
- struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
- struct journal_list *jlist =
- container_of(cl->parent, struct journal_list, cl);
- struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
- struct journal_read_buf buf = { NULL, 0 };
-
- DECLARE_BITMAP(bitmap, ja->nr);
- unsigned i, l, r;
- u64 seq = 0;
- int ret;
-
- if (!ja->nr)
- goto out;
-
- bitmap_zero(bitmap, ja->nr);
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
- if (ret)
- goto err;
-
- pr_debug("%u journal buckets", ja->nr);
-
-	/*
-	 * If the device supports discard but not secure discard, the live
-	 * journal entries might not form a contiguous range, so we can't rely
-	 * on the fancy Fibonacci hash/binary search - for now, just read every
-	 * bucket and jump straight to search_done (the fast path below is
-	 * bypassed):
-	 */
- for (i = 0; i < ja->nr; i++)
- read_bucket(i);
- goto search_done;
-
- if (!blk_queue_nonrot(q))
- goto linear_scan;
-
- /*
- * Read journal buckets ordered by golden ratio hash to quickly
- * find a sequence of buckets with valid journal entries
- */
- for (i = 0; i < ja->nr; i++) {
- l = (i * 2654435769U) % ja->nr;
-
- if (test_bit(l, bitmap))
- break;
-
- if (read_bucket(l))
- goto bsearch;
- }
-
- /*
- * If that fails, check all the buckets we haven't checked
- * already
- */
- pr_debug("falling back to linear search");
-linear_scan:
- for (l = find_first_zero_bit(bitmap, ja->nr);
- l < ja->nr;
- l = find_next_zero_bit(bitmap, ja->nr, l + 1))
- if (read_bucket(l))
- goto bsearch;
-
- /* no journal entries on this device? */
- if (l == ja->nr)
- goto out;
-bsearch:
- /* Binary search */
- r = find_next_bit(bitmap, ja->nr, l + 1);
- pr_debug("starting binary search, l %u r %u", l, r);
-
- while (l + 1 < r) {
- unsigned m = (l + r) >> 1;
- u64 cur_seq = seq;
-
- read_bucket(m);
-
- if (cur_seq != seq)
- l = m;
- else
- r = m;
- }
-
-search_done:
- /*
- * Find the journal bucket with the highest sequence number:
- *
-	 * If there are duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- seq = 0;
-
- for (i = 0; i < ja->nr; i++)
- if (ja->bucket_seq[i] >= seq &&
- ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
- /*
- * When journal_next_bucket() goes to allocate for
- * the first time, it'll use the bucket after
- * ja->cur_idx
- */
- ja->cur_idx = i;
- seq = ja->bucket_seq[i];
- }
-
- /*
- * Set last_idx to indicate the entire journal is full and needs to be
- * reclaimed - journal reclaim will immediately reclaim whatever isn't
- * pinned when it first runs:
- */
- ja->last_idx = (ja->cur_idx + 1) % ja->nr;
-
- /*
- * Read buckets in reverse order until we stop finding more journal
- * entries:
- */
- for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
- i != ja->cur_idx;
- i = (i + ja->nr - 1) % ja->nr)
- if (!test_bit(i, bitmap) &&
- !read_bucket(i))
- break;
-out:
- free_pages((unsigned long) buf.data, get_order(buf.size));
- percpu_ref_put(&ca->io_ref);
- closure_return(cl);
-err:
- mutex_lock(&jlist->lock);
- jlist->ret = ret;
- mutex_unlock(&jlist->lock);
- goto out;
-#undef read_bucket
-}
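-
-/*
- * Illustrative sketch (added in editing, not part of the original file): the
- * probe order produced by the golden ratio hash used above (currently
- * bypassed in favour of reading every bucket).  Multiplying by 2654435769
- * (~2^32 / phi) scatters successive probes across the buckets; when the
- * multiplier and ja->nr share a common factor not every bucket gets visited,
- * which is why the code falls back to a linear scan over the bitmap.
- */
-#if 0
-static void golden_ratio_probe_sketch(unsigned nr)
-{
-	unsigned i;
-
-	if (!nr)
-		return;
-
-	for (i = 0; i < nr; i++)
-		pr_debug("probe %u -> bucket %u", i, (i * 2654435769U) % nr);
-}
-#endif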
-
-void bch_journal_entries_free(struct list_head *list)
-{
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvfree(i);
- }
-}
-
-static int journal_seq_blacklist_read(struct journal *j,
- struct journal_replay *i,
- struct journal_entry_pin_list *p)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *entry;
- struct journal_seq_blacklist *bl;
- u64 seq;
-
- for_each_jset_entry_type(entry, &i->j,
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- seq = le64_to_cpu(entry->_data[0]);
-
- bch_verbose(c, "blacklisting existing journal seq %llu", seq);
-
- bl = bch_journal_seq_blacklisted_new(j, seq);
- if (!bl)
- return -ENOMEM;
-
- journal_pin_add_entry(j, p, &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- return 0;
-}
-
-static inline bool journal_has_keys(struct list_head *list)
-{
- struct journal_replay *i;
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
-
- list_for_each_entry(i, list, list)
- for_each_jset_key(k, _n, entry, &i->j)
- return true;
-
- return false;
-}
-
-int bch_journal_read(struct bch_fs *c, struct list_head *list)
-{
- struct jset_entry *prio_ptrs;
- struct journal_list jlist;
- struct journal_replay *i;
- struct jset *j;
- struct journal_entry_pin_list *p;
- struct bch_dev *ca;
- u64 cur_seq, end_seq;
- unsigned iter;
- int ret = 0;
-
- closure_init_stack(&jlist.cl);
- mutex_init(&jlist.lock);
- jlist.head = list;
- jlist.ret = 0;
-
- for_each_readable_member(ca, c, iter) {
- percpu_ref_get(&ca->io_ref);
- closure_call(&ca->journal.read,
- bch_journal_read_device,
- system_unbound_wq,
- &jlist.cl);
- }
-
- closure_sync(&jlist.cl);
-
- if (jlist.ret)
- return jlist.ret;
-
-	if (list_empty(list)) {
- bch_err(c, "no journal entries found");
- return BCH_FSCK_REPAIR_IMPOSSIBLE;
- }
-
- fsck_err_on(c->sb.clean && journal_has_keys(list), c,
- "filesystem marked clean but journal has keys to replay");
-
- j = &list_entry(list->prev, struct journal_replay, list)->j;
-
- unfixable_fsck_err_on(le64_to_cpu(j->seq) -
- le64_to_cpu(j->last_seq) + 1 >
- c->journal.pin.size, c,
- "too many journal entries open for refcount fifo");
-
- c->journal.pin.back = le64_to_cpu(j->seq) -
- le64_to_cpu(j->last_seq) + 1;
-
- atomic64_set(&c->journal.seq, le64_to_cpu(j->seq));
- c->journal.last_seq_ondisk = le64_to_cpu(j->last_seq);
-
- BUG_ON(last_seq(&c->journal) != le64_to_cpu(j->last_seq));
-
- i = list_first_entry(list, struct journal_replay, list);
-
- mutex_lock(&c->journal.blacklist_lock);
-
- fifo_for_each_entry_ptr(p, &c->journal.pin, iter) {
- u64 seq = journal_pin_seq(&c->journal, p);
-
- INIT_LIST_HEAD(&p->list);
-
- if (i && le64_to_cpu(i->j.seq) == seq) {
- atomic_set(&p->count, 1);
-
- if (journal_seq_blacklist_read(&c->journal, i, p)) {
- mutex_unlock(&c->journal.blacklist_lock);
- return -ENOMEM;
- }
-
- i = list_is_last(&i->list, list)
- ? NULL
- : list_next_entry(i, list);
- } else {
- atomic_set(&p->count, 0);
- }
- }
-
- mutex_unlock(&c->journal.blacklist_lock);
-
- cur_seq = last_seq(&c->journal);
- end_seq = le64_to_cpu(list_last_entry(list,
- struct journal_replay, list)->j.seq);
-
- list_for_each_entry(i, list, list) {
- bool blacklisted;
-
- mutex_lock(&c->journal.blacklist_lock);
- while (cur_seq < le64_to_cpu(i->j.seq) &&
- journal_seq_blacklist_find(&c->journal, cur_seq))
- cur_seq++;
-
- blacklisted = journal_seq_blacklist_find(&c->journal,
- le64_to_cpu(i->j.seq));
- mutex_unlock(&c->journal.blacklist_lock);
-
- fsck_err_on(blacklisted, c,
- "found blacklisted journal entry %llu",
- le64_to_cpu(i->j.seq));
-
- fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- cur_seq, le64_to_cpu(i->j.seq) - 1,
- last_seq(&c->journal), end_seq);
-
- cur_seq = le64_to_cpu(i->j.seq) + 1;
- }
-
- prio_ptrs = bch_journal_find_entry(j, JOURNAL_ENTRY_PRIO_PTRS, 0);
- if (prio_ptrs) {
- memcpy_u64s(c->journal.prio_buckets,
- prio_ptrs->_data,
- le16_to_cpu(prio_ptrs->u64s));
- c->journal.nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s);
- }
-fsck_err:
- return ret;
-}
-
-void bch_journal_mark(struct bch_fs *c, struct list_head *list)
-{
- struct bkey_i *k, *n;
- struct jset_entry *j;
- struct journal_replay *r;
-
- list_for_each_entry(r, list, list)
- for_each_jset_key(k, n, j, &r->j) {
- enum bkey_type type = bkey_type(j->level, j->btree_id);
- struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
-
- if (btree_type_has_ptrs(type))
- bch_btree_mark_key_initial(c, type, k_s_c);
- }
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-void bch_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- __bch_time_stats_update(j->delay_time,
- j->need_write_time);
-#if 0
- closure_call(&j->io, journal_write, NULL, &c->cl);
-#else
- /* Shut sparse up: */
- closure_init(&j->io, &c->cl);
- set_closure_fn(&j->io, journal_write, NULL);
- journal_write(&j->io);
-#endif
-}
-
-static void __bch_journal_next_entry(struct journal *j)
-{
- struct journal_entry_pin_list pin_list, *p;
- struct journal_buf *buf;
-
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- BUG_ON(!fifo_push(&j->pin, pin_list));
- p = &fifo_peek_back(&j->pin);
-
- INIT_LIST_HEAD(&p->list);
- atomic_set(&p->count, 1);
-
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) {
- smp_wmb();
- j->cur_pin_list = p;
- }
-
- buf = journal_cur_buf(j);
- memset(buf->has_inode, 0, sizeof(buf->has_inode));
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(atomic64_read(&j->seq));
- buf->data->u64s = 0;
-
- BUG_ON(journal_pin_seq(j, p) != atomic64_read(&j->seq));
-}
-
-static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
-{
- unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-
- if (buf->nr_prio_buckets)
- ret += JSET_KEYS_U64s + buf->nr_prio_buckets;
-
- return ret;
-}
-
-static enum {
- JOURNAL_ENTRY_ERROR,
- JOURNAL_ENTRY_INUSE,
- JOURNAL_ENTRY_CLOSED,
- JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return JOURNAL_ENTRY_CLOSED;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return JOURNAL_ENTRY_ERROR;
-
- if (new.prev_buf_unwritten)
- return JOURNAL_ENTRY_INUSE;
-
- /*
- * avoid race between setting buf->data->u64s and
- * journal_res_put starting write:
- */
- journal_state_inc(&new);
-
- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
- new.idx++;
- new.prev_buf_unwritten = 1;
-
- BUG_ON(journal_state_count(new, new.idx));
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- journal_reclaim_fast(j);
-
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
- buf = &j->buf[old.idx];
- buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- buf->data->last_seq = cpu_to_le64(last_seq(j));
-
- j->prev_buf_sectors =
- vstruct_blocks_plus(buf->data, c->block_bits,
- journal_entry_u64s_reserve(buf)) *
- c->sb.block_size;
-
- BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
-
- atomic_dec_bug(&fifo_peek_back(&j->pin).count);
- __bch_journal_next_entry(j);
-
- cancel_delayed_work(&j->write_work);
- spin_unlock(&j->lock);
-
- if (c->bucket_journal_seq > 1 << 14) {
- c->bucket_journal_seq = 0;
- bch_bucket_seq_cleanup(c);
- }
-
- /* ugh - might be called from __journal_res_get() under wait_event() */
- __set_current_state(TASK_RUNNING);
- bch_journal_buf_put(j, old.idx, need_write_just_set);
-
- return JOURNAL_UNLOCKED;
-}
-
-void bch_journal_halt(struct journal *j)
-{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
-
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- wake_up(&j->wait);
- closure_wake_up(&journal_cur_buf(j)->wait);
- closure_wake_up(&journal_prev_buf(j)->wait);
-}
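-
-/*
- * Illustrative sketch (added in editing, not part of the original file): the
- * lockless update pattern shared by journal_buf_switch(), bch_journal_halt()
- * and journal_entry_open().  The whole reservation state fits in one 64 bit
- * word, so every transition is "read, compute the new state, cmpxchg, retry
- * on conflict".  The helper name and callback are hypothetical.
- */
-#if 0
-static void journal_state_transition_sketch(struct journal *j,
-			void (*update)(union journal_res_state *new))
-{
-	union journal_res_state old, new;
-	u64 v = atomic64_read(&j->reservations.counter);
-
-	do {
-		old.v = new.v = v;
-
-		/* compute the desired successor state: */
-		update(&new);
-	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
-				       old.v, new.v)) != old.v);
-}
-#endif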
-
-static unsigned journal_dev_buckets_available(struct journal *j,
- struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
- unsigned next = (ja->cur_idx + 1) % ja->nr;
- unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
- /*
- * Hack to avoid a deadlock during journal replay:
- * journal replay might require setting a new btree
- * root, which requires writing another journal entry -
- * thus, if the journal is full (and this happens when
- * replaying the first journal bucket's entries) we're
- * screwed.
- *
- * So don't let the journal fill up unless we're in
- * replay:
- */
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
- available = max((int) available - 2, 0);
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (ja->bucket_seq[ja->last_idx] >= last_seq(j))
- available = max((int) available - 1, 0);
-
- return available;
-}
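-
-/*
- * Illustrative worked example (added in editing, not part of the original
- * file): the ring arithmetic above, with hypothetical values.  With nr = 8,
- * cur_idx = 6 and last_idx = 2, next = 7 and (2 + 8 - 7) % 8 = 3 buckets are
- * available, before the replay and last-bucket adjustments are applied.
- */
-#if 0
-static unsigned journal_ring_space_sketch(unsigned cur_idx, unsigned last_idx,
-					  unsigned nr)
-{
-	unsigned next = (cur_idx + 1) % nr;
-
-	return (last_idx + nr - next) % nr;
-}
-#endif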
-
-/* returns number of sectors available for next journal entry: */
-static int journal_entry_sectors(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- unsigned sectors_available = j->entry_size_max >> 9;
- unsigned i, nr_online = 0, nr_devs = 0;
-
- lockdep_assert_held(&j->lock);
-
- spin_lock(&j->devs.lock);
- group_for_each_dev(ca, &j->devs, i) {
- unsigned buckets_required = 0;
-
- sectors_available = min_t(unsigned, sectors_available,
- ca->mi.bucket_size);
-
- /*
- * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, if we haven't started the write
- * for the previous entry we have to make sure we have space for
- * it too:
- */
- if (bch_extent_has_device(e.c, ca->dev_idx)) {
- if (j->prev_buf_sectors > ca->journal.sectors_free)
- buckets_required++;
-
- if (j->prev_buf_sectors + sectors_available >
- ca->journal.sectors_free)
- buckets_required++;
- } else {
- if (j->prev_buf_sectors + sectors_available >
- ca->mi.bucket_size)
- buckets_required++;
-
- buckets_required++;
- }
-
- if (journal_dev_buckets_available(j, ca) >= buckets_required)
- nr_devs++;
- nr_online++;
- }
- spin_unlock(&j->devs.lock);
-
- if (nr_online < c->opts.metadata_replicas_required)
- return -EROFS;
-
- if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
- return 0;
-
- return sectors_available;
-}
-
-/*
- * should _only_ be called from journal_res_get() - when we actually want a
- * journal reservation - journal entry is open means journal is dirty:
- */
-static int journal_entry_open(struct journal *j)
-{
- struct journal_buf *buf = journal_cur_buf(j);
- ssize_t u64s;
- int ret = 0, sectors;
-
- lockdep_assert_held(&j->lock);
- BUG_ON(journal_entry_is_open(j));
-
- if (!fifo_free(&j->pin))
- return 0;
-
- sectors = journal_entry_sectors(j);
- if (sectors <= 0)
- return sectors;
-
- j->cur_buf_sectors = sectors;
- buf->nr_prio_buckets = j->nr_prio_buckets;
-
- u64s = (sectors << 9) / sizeof(u64);
-
- /* Subtract the journal header */
- u64s -= sizeof(struct jset) / sizeof(u64);
- /*
- * Btree roots, prio pointers don't get added until right before we do
- * the write:
- */
- u64s -= journal_entry_u64s_reserve(buf);
- u64s = max_t(ssize_t, 0L, u64s);
-
- BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
-
- if (u64s > le32_to_cpu(buf->data->u64s)) {
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- /*
- * Must be set before marking the journal entry as open:
- */
- j->cur_entry_u64s = u64s;
-
- do {
- old.v = new.v = v;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return false;
-
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
- ret = 1;
-
- wake_up(&j->wait);
-
- if (j->res_get_blocked_start) {
- __bch_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
- }
-
- mod_delayed_work(system_freezable_wq,
- &j->write_work,
- msecs_to_jiffies(j->write_delay_ms));
- }
-
- return ret;
-}
-
-void bch_journal_start(struct bch_fs *c)
-{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl;
- u64 new_seq = 0;
-
- list_for_each_entry(bl, &j->seq_blacklist, list)
- new_seq = max(new_seq, bl->seq);
-
- spin_lock(&j->lock);
-
- set_bit(JOURNAL_STARTED, &j->flags);
-
- while (atomic64_read(&j->seq) < new_seq) {
- struct journal_entry_pin_list pin_list, *p;
-
- BUG_ON(!fifo_push(&j->pin, pin_list));
- p = &fifo_peek_back(&j->pin);
-
- INIT_LIST_HEAD(&p->list);
- atomic_set(&p->count, 0);
- atomic64_inc(&j->seq);
- }
-
- /*
- * journal_buf_switch() only inits the next journal entry when it
- * closes an open journal entry - the very first journal entry gets
- * initialized here:
- */
- __bch_journal_next_entry(j);
-
- /*
- * Adding entries to the next journal entry before allocating space on
- * disk for the next journal entry - this is ok, because these entries
- * only have to go down with the next journal entry we write:
- */
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (!bl->written) {
- bch_journal_add_entry(journal_cur_buf(j), &bl->seq, 1,
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
- 0, 0);
-
- journal_pin_add_entry(j,
- &fifo_peek_back(&j->pin),
- &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- spin_unlock(&j->lock);
-
- queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
-}
-
-int bch_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- int ret = 0, keys = 0, entries = 0;
- struct journal *j = &c->journal;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
-
- list_for_each_entry_safe(i, n, list, list) {
- j->cur_pin_list =
- &j->pin.data[((j->pin.back - 1 -
- (atomic64_read(&j->seq) -
- le64_to_cpu(i->j.seq))) &
- j->pin.mask)];
-
- for_each_jset_key(k, _n, entry, &i->j) {
- struct disk_reservation disk_res;
-
- /*
- * We might cause compressed extents to be split, so we
- * need to pass in a disk_reservation:
- */
- BUG_ON(bch_disk_reservation_get(c, &disk_res, 0, 0));
-
- trace_bcache_journal_replay_key(&k->k);
-
- ret = bch_btree_insert(c, entry->btree_id, k,
- &disk_res, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
- bch_disk_reservation_put(c, &disk_res);
-
- if (ret)
- goto err;
-
- cond_resched();
- keys++;
- }
-
- if (atomic_dec_and_test(&j->cur_pin_list->count))
- wake_up(&j->wait);
-
- entries++;
- }
-
- if (keys) {
- bch_btree_flush(c);
-
- /*
- * Write a new journal entry _before_ we start journalling new data -
- * otherwise, we could end up with btree node bsets with journal seqs
- * arbitrarily far in the future vs. the most recently written journal
- * entry on disk, if we crash before writing the next journal entry:
- */
- ret = bch_journal_meta(&c->journal);
- if (ret)
- goto err;
- }
-
- bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
- keys, entries, (u64) atomic64_read(&j->seq));
-
- bch_journal_set_replay_done(&c->journal);
-err:
- if (ret)
- bch_err(c, "journal replay error: %d", ret);
-
- bch_journal_entries_free(list);
-
- return ret;
-}
-
-#if 0
-/*
- * Allocate more journal space at runtime - not currently making use of it, but
- * the code works:
- */
-static int bch_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
-{
- struct journal *j = &c->journal;
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- struct disk_reservation disk_res = { 0, 0 };
- struct closure cl;
- u64 *new_bucket_seq = NULL, *new_buckets = NULL;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
-
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- */
-
- if (bch_disk_reservation_get(c, &disk_res,
- (nr - ja->nr) << ca->bucket_bits, 0))
- return -ENOSPC;
-
- mutex_lock(&c->sb_lock);
-
- ret = -ENOMEM;
- new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
- goto err;
-
- journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
- goto err;
-
- spin_lock(&j->lock);
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
-
- while (ja->nr < nr) {
- /* must happen under journal lock, to avoid racing with gc: */
- u64 b = bch_bucket_alloc(ca, RESERVE_NONE);
- if (!b) {
- if (!closure_wait(&c->freelist_wait, &cl)) {
- spin_unlock(&j->lock);
- closure_sync(&cl);
- spin_lock(&j->lock);
- }
- continue;
- }
-
- bch_mark_metadata_bucket(ca, &ca->buckets[b],
- BUCKET_JOURNAL, false);
- bch_mark_alloc_bucket(ca, &ca->buckets[b], false);
-
- memmove(ja->buckets + ja->last_idx + 1,
- ja->buckets + ja->last_idx,
- (ja->nr - ja->last_idx) * sizeof(u64));
- memmove(ja->bucket_seq + ja->last_idx + 1,
- ja->bucket_seq + ja->last_idx,
- (ja->nr - ja->last_idx) * sizeof(u64));
- memmove(journal_buckets->buckets + ja->last_idx + 1,
- journal_buckets->buckets + ja->last_idx,
- (ja->nr - ja->last_idx) * sizeof(u64));
-
- ja->buckets[ja->last_idx] = b;
- journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
-
- if (ja->last_idx < ja->nr) {
- if (ja->cur_idx >= ja->last_idx)
- ja->cur_idx++;
- ja->last_idx++;
- }
- ja->nr++;
-
- }
- spin_unlock(&j->lock);
-
- BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
-
- bch_write_super(c);
-
- ret = 0;
-err:
- mutex_unlock(&c->sb_lock);
-
- kfree(new_bucket_seq);
- kfree(new_buckets);
- bch_disk_reservation_put(c, &disk_res);
-
- return ret;
-}
-#endif
-
-int bch_dev_journal_alloc(struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- unsigned i, nr;
- u64 b, *p;
-
- if (dynamic_fault("bcache:add:journal_alloc"))
- return -ENOMEM;
-
- /*
-	 * Aim for roughly 1/256th of the device, clamped to at least
-	 * BCH_JOURNAL_BUCKETS_MIN and to at most 1024 buckets or 512MB worth
-	 * (in sectors), whichever is smaller:
- */
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
-
- p = krealloc(ja->bucket_seq, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
-
- ja->bucket_seq = p;
-
- p = krealloc(ja->buckets, nr * sizeof(u64),
- GFP_KERNEL|__GFP_ZERO);
- if (!p)
- return -ENOMEM;
-
- ja->buckets = p;
-
- journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
- return -ENOMEM;
-
- for (i = 0, b = ca->mi.first_bucket;
- i < nr && b < ca->mi.nbuckets; b++) {
- if (!is_available_bucket(ca->buckets[b].mark))
- continue;
-
- bch_mark_metadata_bucket(ca, &ca->buckets[b],
- BUCKET_JOURNAL, true);
- ja->buckets[i] = b;
- journal_buckets->buckets[i] = cpu_to_le64(b);
- i++;
- }
-
- if (i < nr)
- return -ENOSPC;
-
- BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
-
- ja->nr = nr;
-
- return 0;
-}
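-
-/*
- * Illustrative worked example (added in editing, not part of the original
- * file): the sizing above, for a hypothetical device with 2^20 buckets of
- * 2048 sectors (1MiB) each.  nbuckets >> 8 = 4096, the cap is
- * min(1024, (1 << 20) / 2048) = 512, so the journal gets 512 one-megabyte
- * buckets - i.e. the 512MB limit.
- */
-#if 0
-static unsigned journal_nr_buckets_sketch(u64 nbuckets, unsigned bucket_size)
-{
-	return clamp_t(unsigned, nbuckets >> 8,
-		       BCH_JOURNAL_BUCKETS_MIN,
-		       min(1U << 10, (1U << 20) / bucket_size));
-}
-#endif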
-
-/* Journalling */
-
-/**
- * journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Pops entries off the
- * front of the journal pin fifo whose refcount has reached zero - i.e. all
- * the btree nodes they pinned have been written out - and wakes up anyone
- * waiting for journal space; discards and heavier reclaim are left to
- * journal_reclaim_work().
- */
-static void journal_reclaim_fast(struct journal *j)
-{
- struct journal_entry_pin_list temp;
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
- }
-
- if (popped)
- wake_up(&j->wait);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, marking it as dirty:
- */
-
-static inline void __journal_pin_add(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- BUG_ON(journal_pin_active(pin));
-
- atomic_inc(&pin_list->count);
- pin->pin_list = pin_list;
- pin->flush = flush_fn;
-
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
-}
-
-static void journal_pin_add_entry(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock_irq(&j->pin_lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock_irq(&j->pin_lock);
-}
-
-void bch_journal_pin_add(struct journal *j,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock_irq(&j->pin_lock);
- __journal_pin_add(j, j->cur_pin_list, pin, flush_fn);
- spin_unlock_irq(&j->pin_lock);
-}
-
-static inline bool __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list = pin->pin_list;
-
- pin->pin_list = NULL;
-
- /* journal_reclaim_work() might have already taken us off the list */
- if (!list_empty_careful(&pin->list))
- list_del_init(&pin->list);
-
- return atomic_dec_and_test(&pin_list->count);
-}
-
-void bch_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- unsigned long flags;
- bool wakeup;
-
- if (!journal_pin_active(pin))
- return;
-
- spin_lock_irqsave(&j->pin_lock, flags);
- wakeup = __journal_pin_drop(j, pin);
- spin_unlock_irqrestore(&j->pin_lock, flags);
-
- /*
-	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- *
- * Nested irqsave is expensive, don't do the wakeup with lock held:
- */
- if (wakeup)
- wake_up(&j->wait);
-}
-
-void bch_journal_pin_add_if_older(struct journal *j,
- struct journal_entry_pin *src_pin,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock_irq(&j->pin_lock);
-
- if (journal_pin_active(src_pin) &&
- (!journal_pin_active(pin) ||
- fifo_entry_idx(&j->pin, src_pin->pin_list) <
- fifo_entry_idx(&j->pin, pin->pin_list))) {
- if (journal_pin_active(pin))
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
- }
-
- spin_unlock_irq(&j->pin_lock);
-}
-
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret = NULL;
- unsigned iter;
-
- /* so we don't iterate over empty fifo entries below: */
- if (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- spin_lock(&j->lock);
- journal_reclaim_fast(j);
- spin_unlock(&j->lock);
- }
-
- spin_lock_irq(&j->pin_lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
- if (journal_pin_seq(j, pin_list) > seq_to_flush)
- break;
-
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret) {
- /* must be list_del_init(), see bch_journal_pin_drop() */
- list_del_init(&ret->list);
- break;
- }
- }
- spin_unlock_irq(&j->pin_lock);
-
- return ret;
-}
-
-static bool journal_has_pins(struct journal *j)
-{
- bool ret;
-
- spin_lock(&j->lock);
- journal_reclaim_fast(j);
- ret = fifo_used(&j->pin) > 1 ||
- atomic_read(&fifo_peek_front(&j->pin).count) > 1;
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-void bch_journal_flush_pins(struct journal *j)
-{
- struct journal_entry_pin *pin;
-
- while ((pin = journal_get_next_pin(j, U64_MAX)))
- pin->flush(j, pin);
-
- wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j));
-}
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = ja->nr &&
- (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/**
- * journal_reclaim_work - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static void journal_reclaim_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs, journal.reclaim_work);
- struct journal *j = &c->journal;
- struct bch_dev *ca;
- struct journal_entry_pin *pin;
- u64 seq_to_flush = 0;
- unsigned iter, bucket_to_flush;
- unsigned long next_flush;
- bool reclaim_lock_held = false, need_flush;
-
- /*
- * Advance last_idx to point to the oldest journal entry containing
- * btree node updates that have not yet been written out
- */
- for_each_rw_member(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
- while (should_discard_bucket(j, ja)) {
- if (!reclaim_lock_held) {
- /*
- * ugh:
- * might be called from __journal_res_get()
- * under wait_event() - have to go back to
- * TASK_RUNNING before doing something that
- * would block, but only if we're doing work:
- */
- __set_current_state(TASK_RUNNING);
-
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- /* recheck under reclaim_lock: */
- continue;
- }
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->last_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) % ja->nr;
- spin_unlock(&j->lock);
-
- wake_up(&j->wait);
- }
-
- /*
- * Write out enough btree nodes to free up 50% journal
- * buckets
- */
- spin_lock(&j->lock);
- bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
- seq_to_flush = max_t(u64, seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- spin_unlock(&j->lock);
- }
-
- if (reclaim_lock_held)
- mutex_unlock(&j->reclaim_lock);
-
- /* Also flush if the pin fifo is more than half full */
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) atomic64_read(&j->seq) -
- (j->pin.size >> 1));
-
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
- need_flush = time_after(jiffies, next_flush);
-
- while ((pin = journal_get_next_pin(j, need_flush
- ? U64_MAX
- : seq_to_flush))) {
- __set_current_state(TASK_RUNNING);
- pin->flush(j, pin);
- need_flush = false;
-
- j->last_flushed = jiffies;
- }
-
- if (!test_bit(BCH_FS_RO, &c->flags))
- queue_delayed_work(system_freezable_wq, &j->reclaim_work,
- msecs_to_jiffies(j->reclaim_delay_ms));
-}
-
-/**
- * journal_write_alloc - pick the devices/buckets for the next journal write,
- * moving on to the next journal bucket if possible
- */
-static int journal_write_alloc(struct journal *j, unsigned sectors)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- struct bch_extent_ptr *ptr;
- struct journal_device *ja;
- struct bch_dev *ca;
- bool swapped;
- unsigned i, replicas, replicas_want =
- READ_ONCE(c->opts.metadata_replicas);
-
- spin_lock(&j->lock);
-
- /*
- * Drop any pointers to devices that have been removed, are no longer
- * empty, or filled up their current journal bucket:
- *
- * Note that a device may have had a small amount of free space (perhaps
- * one sector) that wasn't enough for the smallest possible journal
-	 * entry - that's why we drop any device whose free journal space is <=
-	 * the size of this entry, i.e. whichever device was limiting the
-	 * current journal entry size.
- */
- extent_for_each_ptr_backwards(e, ptr) {
- ca = c->devs[ptr->dev];
-
- if (ca->mi.state != BCH_MEMBER_STATE_RW ||
- ca->journal.sectors_free <= sectors)
- __bch_extent_drop_ptr(e, ptr);
- else
- ca->journal.sectors_free -= sectors;
- }
-
- replicas = bch_extent_nr_ptrs(e.c);
-
- spin_lock(&j->devs.lock);
-
- /* Sort by tier: */
- do {
- swapped = false;
-
- for (i = 0; i + 1 < j->devs.nr; i++)
- if (j->devs.d[i + 0].dev->mi.tier >
- j->devs.d[i + 1].dev->mi.tier) {
- swap(j->devs.d[i], j->devs.d[i + 1]);
- swapped = true;
- }
- } while (swapped);
-
- /*
- * Pick devices for next journal write:
- * XXX: sort devices by free journal space?
- */
- group_for_each_dev(ca, &j->devs, i) {
- ja = &ca->journal;
-
- if (replicas >= replicas_want)
- break;
-
- /*
- * Check that we can use this device, and aren't already using
- * it:
- */
- if (bch_extent_has_device(e.c, ca->dev_idx) ||
- !journal_dev_buckets_available(j, ca) ||
- sectors > ca->mi.bucket_size)
- continue;
-
- ja->sectors_free = ca->mi.bucket_size - sectors;
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
-
- extent_ptr_append(bkey_i_to_extent(&j->key),
- (struct bch_extent_ptr) {
- .offset = bucket_to_sector(ca,
- ja->buckets[ja->cur_idx]),
- .dev = ca->dev_idx,
- });
- replicas++;
-
- trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
- }
- spin_unlock(&j->devs.lock);
-
- j->prev_buf_sectors = 0;
- spin_unlock(&j->lock);
-
- if (replicas < c->opts.metadata_replicas_required)
- return -EROFS;
-
- BUG_ON(!replicas);
-
- return 0;
-}
-
-static void journal_write_compact(struct jset *jset)
-{
- struct jset_entry *i, *next, *prev = NULL;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each_safe(jset, i, next) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
- JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
- }
-
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
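-
-/*
- * Illustrative example (added in editing, not part of the original file) of
- * the effect of the compaction above: empty reservations disappear and
- * adjacent BTREE_KEYS entries for the same btree/level are merged, e.g.
- *
- *   before: [extents u64s=4] [empty u64s=0] [extents u64s=7] [inodes u64s=3]
- *   after:  [extents u64s=11] [inodes u64s=3]
- */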
-
-static void journal_write_endio(struct bio *bio)
-{
- struct bch_dev *ca = bio->bi_private;
- struct journal *j = &ca->fs->journal;
-
- if (bch_dev_fatal_io_err_on(bio->bi_error, ca, "journal write") ||
- bch_meta_write_fault("journal"))
- bch_journal_halt(j);
-
- closure_put(&j->io);
- percpu_ref_put(&ca->io_ref);
-}
-
-static void journal_write_done(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct journal_buf *w = journal_prev_buf(j);
-
- j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
-
- __bch_time_stats_update(j->write_time, j->write_start_time);
-
- BUG_ON(!j->reservations.prev_buf_unwritten);
- atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
- &j->reservations.counter);
-
- /*
- * XXX: this is racy, we could technically end up doing the wake up
- * after the journal_buf struct has been reused for the next write
- * (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that
- * are waiting on the _next_ write, not this one.
- *
- * The wake up can't come before, because journal_flush_seq_async() is
- * looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal
- * write that was already in flight.
- *
- * The right fix is to use a lock here, but using j.lock here means it
- * has to be a spin_lock_irqsave() lock which then requires propagating
- * the irq()ness to other locks and it's all kinds of nastiness.
- */
-
- closure_wake_up(&w->wait);
- wake_up(&j->wait);
-
- /*
- * Updating last_seq_ondisk may let journal_reclaim_work() discard more
- * buckets:
- */
- mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
-}
-
-static void journal_write(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_prev_buf(j);
- struct jset *jset = w->data;
- struct bio *bio;
- struct bch_extent_ptr *ptr;
- unsigned i, sectors, bytes;
-
- j->write_start_time = local_clock();
-
- bch_journal_add_prios(j, w);
-
- mutex_lock(&c->btree_root_lock);
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_root *r = &c->btree_roots[i];
-
- if (r->alive)
- bch_journal_add_btree_root(w, i, &r->key, r->level);
- }
- mutex_unlock(&c->btree_root_lock);
-
- journal_write_compact(jset);
-
- jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
- jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
-
- SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
- SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c));
-
- bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
-
- jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
- journal_nonce(jset), jset);
-
- sectors = vstruct_sectors(jset, c->block_bits);
- BUG_ON(sectors > j->prev_buf_sectors);
-
- bytes = vstruct_bytes(w->data);
- memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
-
- if (journal_write_alloc(j, sectors)) {
- bch_journal_halt(j);
- bch_err(c, "Unable to allocate journal write");
- bch_fatal_error(c);
- closure_return_with_destructor(cl, journal_write_done);
- }
-
- bch_check_mark_super(c, &j->key, true);
-
- /*
- * XXX: we really should just disable the entire journal in nochanges
- * mode
- */
- if (c->opts.nochanges)
- goto no_io;
-
- extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
- ca = c->devs[ptr->dev];
- if (!percpu_ref_tryget(&ca->io_ref)) {
- /* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
- continue;
- }
-
- atomic64_add(sectors, &ca->meta_sectors_written);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_iter.bi_size = sectors << 9;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE,
- REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch_bio_map(bio, jset);
-
- trace_bcache_journal_write(bio);
- closure_bio_submit_punt(bio, cl, c);
-
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
- }
-
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
- percpu_ref_get(&ca->io_ref);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
- closure_bio_submit_punt(bio, cl, c);
- }
-
-no_io:
- extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
- ptr->offset += sectors;
-
- closure_return_with_destructor(cl, journal_write_done);
-}
-
-static void journal_write_work(struct work_struct *work)
-{
- struct journal *j = container_of(to_delayed_work(work),
- struct journal, write_work);
- spin_lock(&j->lock);
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
-
- if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED)
- spin_unlock(&j->lock);
-}
-
-/*
- * Given an inode number, if that inode number has data in the journal that
- * hasn't yet been flushed, return the journal sequence number that needs to be
- * flushed:
- */
-u64 bch_inode_journal_seq(struct journal *j, u64 inode)
-{
- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- u64 seq = 0;
-
- if (!test_bit(h, j->buf[0].has_inode) &&
- !test_bit(h, j->buf[1].has_inode))
- return 0;
-
- spin_lock(&j->lock);
- if (test_bit(h, journal_cur_buf(j)->has_inode))
- seq = atomic64_read(&j->seq);
- else if (test_bit(h, journal_prev_buf(j)->has_inode))
- seq = atomic64_read(&j->seq) - 1;
- spin_unlock(&j->lock);
-
- return seq;
-}
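-
-/*
- * Illustrative usage sketch (added in editing, not part of the original
- * file): an fsync-style caller can combine the lookup above with
- * bch_journal_flush_seq() so that a journal flush is only issued when the
- * inode actually has unflushed keys.  The helper name is hypothetical.
- */
-#if 0
-static int journal_flush_inode_sketch(struct journal *j, u64 inode)
-{
-	u64 seq = bch_inode_journal_seq(j, inode);
-
-	return seq ? bch_journal_flush_seq(j, seq) : 0;
-}
-#endif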
-
-static int __journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s_min, unsigned u64s_max)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int ret;
-retry:
- ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
- if (ret)
- return ret;
-
- spin_lock(&j->lock);
- /*
- * Recheck after taking the lock, so we don't race with another thread
-	 * that just did journal_entry_open(), and end up closing the entry
-	 * again (via journal_buf_switch()) unnecessarily
- */
- ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
- if (ret) {
- spin_unlock(&j->lock);
- return 1;
- }
-
- /*
- * Ok, no more room in the current journal entry - try to start a new
- * one:
- */
- switch (journal_buf_switch(j, false)) {
- case JOURNAL_ENTRY_ERROR:
- spin_unlock(&j->lock);
- return -EIO;
- case JOURNAL_ENTRY_INUSE:
- /* haven't finished writing out the previous one: */
- spin_unlock(&j->lock);
- trace_bcache_journal_entry_full(c);
- goto blocked;
- case JOURNAL_ENTRY_CLOSED:
- break;
- case JOURNAL_UNLOCKED:
- goto retry;
- }
-
- /* We now have a new, closed journal buf - see if we can open it: */
- ret = journal_entry_open(j);
- spin_unlock(&j->lock);
-
- if (ret < 0)
- return ret;
- if (ret)
- goto retry;
-
- /* Journal's full, we have to wait */
-
- /*
- * Direct reclaim - can't rely on reclaim from work item
- * due to freezing..
- */
- journal_reclaim_work(&j->reclaim_work.work);
-
- trace_bcache_journal_full(c);
-blocked:
- if (!j->res_get_blocked_start)
- j->res_get_blocked_start = local_clock() ?: 1;
- return 0;
-}
-
-/*
- * Essentially the entry function to the journaling code. When bcache is doing
- * a btree insert, it calls this function to get the current journal write.
- * Journal write is the structure used to set up journal writes. The calling
- * function will then add its keys to the structure, queuing them for the
- * next write.
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks.
- */
-int bch_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
- unsigned u64s_min, unsigned u64s_max)
-{
- int ret;
-
- wait_event(j->wait,
- (ret = __journal_res_get(j, res, u64s_min,
- u64s_max)));
- return ret < 0 ? ret : 0;
-}
-
-void bch_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
-{
- spin_lock(&j->lock);
-
- BUG_ON(seq > atomic64_read(&j->seq));
-
- if (bch_journal_error(j)) {
- spin_unlock(&j->lock);
- return;
- }
-
- if (seq == atomic64_read(&j->seq)) {
- if (!closure_wait(&journal_cur_buf(j)->wait, parent))
- BUG();
- } else if (seq + 1 == atomic64_read(&j->seq) &&
- j->reservations.prev_buf_unwritten) {
- if (!closure_wait(&journal_prev_buf(j)->wait, parent))
- BUG();
-
- smp_mb();
-
- /* check if raced with write completion (or failure) */
- if (!j->reservations.prev_buf_unwritten ||
- bch_journal_error(j))
- closure_wake_up(&journal_prev_buf(j)->wait);
- }
-
- spin_unlock(&j->lock);
-}
-
-void bch_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
-{
- spin_lock(&j->lock);
-
- BUG_ON(seq > atomic64_read(&j->seq));
-
- if (bch_journal_error(j)) {
- spin_unlock(&j->lock);
- return;
- }
-
- if (seq == atomic64_read(&j->seq)) {
- bool set_need_write = false;
-
- if (parent &&
- !closure_wait(&journal_cur_buf(j)->wait, parent))
- BUG();
-
- if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
- j->need_write_time = local_clock();
- set_need_write = true;
- }
-
- switch (journal_buf_switch(j, set_need_write)) {
- case JOURNAL_ENTRY_ERROR:
- if (parent)
- closure_wake_up(&journal_cur_buf(j)->wait);
- break;
- case JOURNAL_ENTRY_CLOSED:
- /*
- * Journal entry hasn't been opened yet, but caller
- * claims it has something (seq == j->seq):
- */
- BUG();
- case JOURNAL_ENTRY_INUSE:
- break;
- case JOURNAL_UNLOCKED:
- return;
- }
- } else if (parent &&
- seq + 1 == atomic64_read(&j->seq) &&
- j->reservations.prev_buf_unwritten) {
- if (!closure_wait(&journal_prev_buf(j)->wait, parent))
- BUG();
-
- smp_mb();
-
- /* check if raced with write completion (or failure) */
- if (!j->reservations.prev_buf_unwritten ||
- bch_journal_error(j))
- closure_wake_up(&journal_prev_buf(j)->wait);
- }
-
- spin_unlock(&j->lock);
-}
-
-int bch_journal_flush_seq(struct journal *j, u64 seq)
-{
- struct closure cl;
- u64 start_time = local_clock();
-
- closure_init_stack(&cl);
- bch_journal_flush_seq_async(j, seq, &cl);
- closure_sync(&cl);
-
- bch_time_stats_update(j->flush_seq_time, start_time);
-
- return bch_journal_error(j);
-}
-
-void bch_journal_meta_async(struct journal *j, struct closure *parent)
-{
- struct journal_res res;
- unsigned u64s = jset_u64s(0);
-
- memset(&res, 0, sizeof(res));
-
- bch_journal_res_get(j, &res, u64s, u64s);
- bch_journal_res_put(j, &res);
-
- bch_journal_flush_seq_async(j, res.seq, parent);
-}
-
-int bch_journal_meta(struct journal *j)
-{
- struct journal_res res;
- unsigned u64s = jset_u64s(0);
- int ret;
-
- memset(&res, 0, sizeof(res));
-
- ret = bch_journal_res_get(j, &res, u64s, u64s);
- if (ret)
- return ret;
-
- bch_journal_res_put(j, &res);
-
- return bch_journal_flush_seq(j, res.seq);
-}
-
-void bch_journal_flush_async(struct journal *j, struct closure *parent)
-{
- u64 seq, journal_seq;
-
- spin_lock(&j->lock);
- journal_seq = atomic64_read(&j->seq);
-
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return;
- }
- spin_unlock(&j->lock);
-
- bch_journal_flush_seq_async(j, seq, parent);
-}
-
-int bch_journal_flush(struct journal *j)
-{
- u64 seq, journal_seq;
-
- spin_lock(&j->lock);
- journal_seq = atomic64_read(&j->seq);
-
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return 0;
- }
- spin_unlock(&j->lock);
-
- return bch_journal_flush_seq(j, seq);
-}
-
-ssize_t bch_journal_print_debug(struct journal *j, char *buf)
-{
- union journal_res_state *s = &j->reservations;
- struct bch_dev *ca;
- unsigned iter;
- ssize_t ret = 0;
-
- rcu_read_lock();
- spin_lock(&j->lock);
-
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "active journal entries:\t%zu\n"
- "seq:\t\t\t%llu\n"
- "last_seq:\t\t%llu\n"
- "last_seq_ondisk:\t%llu\n"
- "reservation count:\t%u\n"
- "reservation offset:\t%u\n"
- "current entry u64s:\t%u\n"
- "io in flight:\t\t%i\n"
- "need write:\t\t%i\n"
- "dirty:\t\t\t%i\n"
- "replay done:\t\t%i\n",
- fifo_used(&j->pin),
- (u64) atomic64_read(&j->seq),
- last_seq(j),
- j->last_seq_ondisk,
- journal_state_count(*s, s->idx),
- s->cur_entry_offset,
- j->cur_entry_u64s,
- s->prev_buf_unwritten,
- test_bit(JOURNAL_NEED_WRITE, &j->flags),
- journal_entry_is_open(j),
- test_bit(JOURNAL_REPLAY_DONE, &j->flags));
-
- spin_lock(&j->devs.lock);
- group_for_each_dev(ca, &j->devs, iter) {
- struct journal_device *ja = &ca->journal;
-
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "dev %u:\n"
- "\tnr\t\t%u\n"
- "\tcur_idx\t\t%u (seq %llu)\n"
- "\tlast_idx\t%u (seq %llu)\n",
- iter, ja->nr,
- ja->cur_idx, ja->bucket_seq[ja->cur_idx],
- ja->last_idx, ja->bucket_seq[ja->last_idx]);
- }
- spin_unlock(&j->devs.lock);
-
- spin_unlock(&j->lock);
- rcu_read_unlock();
-
- return ret;
-}
-
-static bool bch_journal_writing_to_device(struct bch_dev *ca)
-{
- struct journal *j = &ca->fs->journal;
- bool ret;
-
- spin_lock(&j->lock);
- ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
- ca->dev_idx);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * This assumes that ca has already been marked read-only so that
- * journal_next_bucket won't pick buckets out of ca any more.
- * Hence, if the journal is not currently pointing to ca, there
- * will be no new writes to journal entries in ca after all the
- * pending ones have been flushed to disk.
- *
- * If the journal is being written to ca, write a new record, and
- * journal_next_bucket will notice that the device is no longer
- * writeable and pick a new set of devices to write to.
- */
-
-int bch_journal_move(struct bch_dev *ca)
-{
- u64 last_flushed_seq;
- struct journal_device *ja = &ca->journal;
- struct bch_fs *c = ca->fs;
- struct journal *j = &c->journal;
- unsigned i;
- int ret = 0; /* Success */
-
- if (bch_journal_writing_to_device(ca)) {
- /*
- * bch_journal_meta will write a record and we'll wait
- * for the write to complete.
- * Actually writing the journal (journal_write_locked)
- * will call journal_next_bucket which notices that the
- * device is no longer writeable, and picks a new one.
- */
- bch_journal_meta(j);
- BUG_ON(bch_journal_writing_to_device(ca));
- }
-
- /*
- * Flush all btree updates to backing store so that any
- * journal entries written to ca become stale and are no
- * longer needed.
- */
-
- /*
- * XXX: switch to normal journal reclaim machinery
- */
- bch_btree_flush(c);
-
- /*
- * Force a meta-data journal entry to be written so that
- * we have newer journal entries in devices other than ca,
- * and wait for the meta data write to complete.
- */
- bch_journal_meta(j);
-
- /*
- * Verify that we no longer need any of the journal entries in
- * the device
- */
- spin_lock(&j->lock);
- last_flushed_seq = last_seq(j);
- spin_unlock(&j->lock);
-
- for (i = 0; i < ja->nr; i += 1)
- BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
-
- return ret;
-}
-
-void bch_fs_journal_stop(struct journal *j)
-{
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return;
-
- /*
- * Empty out the journal by first flushing everything pinning existing
- * journal entries, then force a brand new empty journal entry to be
- * written:
- */
- bch_journal_flush_pins(j);
- bch_journal_flush_async(j, NULL);
- bch_journal_meta(j);
-
- cancel_delayed_work_sync(&j->write_work);
- cancel_delayed_work_sync(&j->reclaim_work);
-}
-
-void bch_dev_journal_exit(struct bch_dev *ca)
-{
- kfree(ca->journal.bio);
- kfree(ca->journal.buckets);
- kfree(ca->journal.bucket_seq);
-
- ca->journal.bio = NULL;
- ca->journal.buckets = NULL;
- ca->journal.bucket_seq = NULL;
-}
-
-int bch_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
-{
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets =
- bch_sb_get_journal(sb);
- unsigned i, journal_entry_pages;
-
- journal_entry_pages =
- DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb),
- PAGE_SECTORS);
-
- ja->nr = bch_nr_journal_buckets(journal_buckets);
-
- ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->bucket_seq)
- return -ENOMEM;
-
- ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
- if (!ca->journal.bio)
- return -ENOMEM;
-
- ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
- if (!ja->buckets)
- return -ENOMEM;
-
- for (i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
-
- return 0;
-}
-
-void bch_fs_journal_exit(struct journal *j)
-{
- unsigned order = get_order(j->entry_size_max);
-
- free_pages((unsigned long) j->buf[1].data, order);
- free_pages((unsigned long) j->buf[0].data, order);
- free_fifo(&j->pin);
-}
-
-int bch_fs_journal_init(struct journal *j, unsigned entry_size_max)
-{
- static struct lock_class_key res_key;
- unsigned order = get_order(entry_size_max);
-
- spin_lock_init(&j->lock);
- spin_lock_init(&j->pin_lock);
- init_waitqueue_head(&j->wait);
- INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
- mutex_init(&j->blacklist_lock);
- INIT_LIST_HEAD(&j->seq_blacklist);
- spin_lock_init(&j->devs.lock);
- mutex_init(&j->reclaim_lock);
-
- lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
- j->entry_size_max = entry_size_max;
- j->write_delay_ms = 100;
- j->reclaim_delay_ms = 100;
-
- bkey_extent_init(&j->key);
-
- atomic64_set(&j->reservations.counter,
- ((union journal_res_state)
- { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
- !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
- return -ENOMEM;
-
- return 0;
-}
diff --git a/libbcache/journal.h b/libbcache/journal.h
deleted file mode 100644
index c83f8104..00000000
--- a/libbcache/journal.h
+++ /dev/null
@@ -1,373 +0,0 @@
-#ifndef _BCACHE_JOURNAL_H
-#define _BCACHE_JOURNAL_H
-
-/*
- * THE JOURNAL:
- *
- * The primary purpose of the journal is to log updates (insertions) to the
- * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
- *
- * Without the journal, the b-tree is always internally consistent on
- * disk - and in fact, in the earliest incarnations bcache didn't have a journal
- * but did handle unclean shutdowns by doing all index updates synchronously
- * (with coalescing).
- *
- * Updates to interior nodes still happen synchronously and without the journal
- * (for simplicity) - this may change eventually but updates to interior nodes
- * are rare enough it's not a huge priority.
- *
- * This means the journal is relatively separate from the b-tree; it consists of
- * just a list of keys and journal replay consists of just redoing those
- * insertions in the same order that they appear in the journal.
- *
- * PERSISTENCE:
- *
- * For synchronous updates (where we're waiting on the index update to hit
- * disk), the journal entry will be written out immediately (or as soon as
- * possible, if the write for the previous journal entry was still in flight).
- *
- * Synchronous updates are specified by passing a closure (@flush_cl) to
- * bch_btree_insert() or bch_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will wait on the journal
- * write to complete (via closure_wait()).
- *
- * If the index update wasn't synchronous, the journal entry will be
- * written out after write_delay_ms milliseconds have elapsed (100 ms by
- * default; see the write_delay_ms field in struct journal).
- *
- * JOURNAL ENTRIES:
- *
- * A journal entry is variable size (struct jset); it's got a fixed length
- * header and then a variable number of struct jset_entry entries.
- *
- * Journal entries are identified by monotonically increasing 64 bit sequence
- * numbers - jset->seq; other places in the code refer to this sequence number.
- *
- * A jset_entry entry contains one or more bkeys (which is what gets inserted
- * into the b-tree). We need a container to indicate which b-tree the key is
- * for; also, the roots of the various b-trees are stored in jset_entry entries
- * (one for each b-tree) - this lets us add new b-tree types without changing
- * the on disk format.
- *
- * We also keep some things in the journal header that are logically part of the
- * superblock - all the things that are frequently updated. This is for future
- * bcache on raw flash support; the superblock (which will become another
- * journal) can't be moved or wear leveled, so it contains just enough
- * information to find the main journal, and the superblock only has to be
- * rewritten when we want to move/wear level the main journal.
- *
- * JOURNAL LAYOUT ON DISK:
- *
- * The journal is written to a ringbuffer of buckets (which is kept in the
- * superblock); the individual buckets are not necessarily contiguous on disk
- * which means that journal entries are not allowed to span buckets, but also
- * that we can resize the journal at runtime if desired (unimplemented).
- *
- * The journal buckets exist in the same pool as all the other buckets that are
- * managed by the allocator and garbage collection - garbage collection marks
- * the journal buckets as metadata buckets.
- *
- * OPEN/DIRTY JOURNAL ENTRIES:
- *
- * Open/dirty journal entries are journal entries that contain b-tree updates
- * that have not yet been written out to the b-tree on disk. We have to track
- * which journal entries are dirty, and we also have to avoid wrapping around
- * the journal and overwriting old but still dirty journal entries with new
- * journal entries.
- *
- * On disk, this is represented with the "last_seq" field of struct jset;
- * last_seq is the first sequence number that journal replay has to replay.
- *
- * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
- * journal_device->seq): for each journal bucket, the highest sequence number
- * of any journal entry it contains. Then, by comparing that against last_seq we
- * can determine whether that journal bucket contains dirty journal entries or
- * not.
- *
- * To track which journal entries are dirty, we maintain a fifo of refcounts
- * (where each entry corresponds to a specific sequence number) - when a ref
- * goes to 0, that journal entry is no longer dirty.
- *
- * Journalling of index updates is done at the same time as the b-tree itself is
- * being modified (see btree_insert_key()); when we add the key to the journal
- * the pending b-tree write takes a ref on the journal entry the key was added
- * to. If a pending b-tree write would need to take refs on multiple dirty
- * journal entries, it only keeps the ref on the oldest one (since a newer
- * journal entry will still be replayed if an older entry was dirty).
- *
- * JOURNAL FILLING UP:
- *
- * There are two ways the journal could fill up; either we could run out of
- * space to write to, or we could have too many open journal entries and run out
- * of room in the fifo of refcounts. Since those refcounts are decremented
- * without any locking we can't safely resize that fifo, so we handle it the
- * same way.
- *
- * If the journal fills up, we start flushing dirty btree nodes until we can
- * allocate space for a journal write again - preferentially flushing btree
- * nodes that are pinning the oldest journal entries first.
- */
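To make the OPEN/DIRTY bookkeeping above concrete, here is a minimal illustrative sketch (not part of the original header; the helper name is hypothetical, and the fields are those of struct journal_device from journal_types.h, included below): a journal bucket may only be reused once every entry it contains is older than last_seq.

/* Illustrative sketch only -- not an existing helper. */
static inline bool example_journal_bucket_reusable(struct journal_device *ja,
						   unsigned idx, u64 last_seq)
{
	/* bucket_seq[idx] is the highest seq of any entry written to that bucket */
	return ja->bucket_seq[idx] < last_seq;
}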
-
-#include <linux/hash.h>
-
-#include "journal_types.h"
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
- struct list_head list;
- struct jset j;
-};
-
-#define JOURNAL_PIN ((32 * 1024) - 1)
-
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
-{
- return pin->pin_list != NULL;
-}
-
-void bch_journal_pin_add(struct journal *, struct journal_entry_pin *,
- journal_pin_flush_fn);
-void bch_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void bch_journal_pin_add_if_older(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
-void bch_journal_flush_pins(struct journal *);
-
-struct closure;
-struct bch_fs;
-struct keylist;
-
-struct bkey_i *bch_journal_find_btree_root(struct bch_fs *, struct jset *,
- enum btree_id, unsigned *);
-
-int bch_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
-
-u64 bch_inode_journal_seq(struct journal *, u64);
-
-static inline int journal_state_count(union journal_res_state s, int idx)
-{
- return idx == 0 ? s.buf0_count : s.buf1_count;
-}
-
-static inline void journal_state_inc(union journal_res_state *s)
-{
- s->buf0_count += s->idx == 0;
- s->buf1_count += s->idx == 1;
-}
-
-static inline void bch_journal_set_has_inode(struct journal_buf *buf, u64 inum)
-{
- set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode);
-}
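For context, a hedged sketch of the matching bloom-filter query (the helper name is hypothetical; the real lookup lives elsewhere): because this is a bloom filter, a set bit only means the buffer may contain keys for that inode.

/* Sketch (assumption), mirroring bch_journal_set_has_inode() above: */
static inline bool example_journal_buf_has_inode(struct journal_buf *buf,
						 u64 inum)
{
	return test_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)),
			buf->has_inode);
}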
-
-/*
- * Amount of space that will be taken up by some keys in the journal (i.e.
- * including the jset header)
- */
-static inline unsigned jset_u64s(unsigned u64s)
-{
- return u64s + sizeof(struct jset_entry) / sizeof(u64);
-}
-
-static inline void bch_journal_add_entry_at(struct journal_buf *buf,
- const void *data, size_t u64s,
- unsigned type, enum btree_id id,
- unsigned level, unsigned offset)
-{
- struct jset_entry *entry = vstruct_idx(buf->data, offset);
-
- entry->u64s = cpu_to_le16(u64s);
- entry->btree_id = id;
- entry->level = level;
- entry->flags = 0;
- SET_JOURNAL_ENTRY_TYPE(entry, type);
-
- memcpy_u64s(entry->_data, data, u64s);
-}
-
-static inline void bch_journal_add_keys(struct journal *j, struct journal_res *res,
- enum btree_id id, const struct bkey_i *k)
-{
- struct journal_buf *buf = &j->buf[res->idx];
- unsigned actual = jset_u64s(k->k.u64s);
-
- EBUG_ON(!res->ref);
- BUG_ON(actual > res->u64s);
-
- bch_journal_set_has_inode(buf, k->k.p.inode);
-
- bch_journal_add_entry_at(buf, k, k->k.u64s,
- JOURNAL_ENTRY_BTREE_KEYS, id,
- 0, res->offset);
-
- res->offset += actual;
- res->u64s -= actual;
-}
-
-void bch_journal_buf_put_slowpath(struct journal *, bool);
-
-static inline void bch_journal_buf_put(struct journal *j, unsigned idx,
- bool need_write_just_set)
-{
- union journal_res_state s;
-
- s.v = atomic64_sub_return(((union journal_res_state) {
- .buf0_count = idx == 0,
- .buf1_count = idx == 1,
- }).v, &j->reservations.counter);
-
- EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
-
- /*
- * Do not initiate a journal write if the journal is in an error state
- * (previous journal entry write may have failed)
- */
- if (s.idx != idx &&
- !journal_state_count(s, idx) &&
- s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
- bch_journal_buf_put_slowpath(j, need_write_just_set);
-}
-
-/*
- * This function releases the journal write structure so other threads can
- * then proceed to add their keys as well.
- */
-static inline void bch_journal_res_put(struct journal *j,
- struct journal_res *res)
-{
- if (!res->ref)
- return;
-
- lock_release(&j->res_map, 0, _RET_IP_);
-
- while (res->u64s) {
- bch_journal_add_entry_at(&j->buf[res->idx], NULL, 0,
- JOURNAL_ENTRY_BTREE_KEYS,
- 0, 0, res->offset);
- res->offset += jset_u64s(0);
- res->u64s -= jset_u64s(0);
- }
-
- bch_journal_buf_put(j, res->idx, false);
-
- res->ref = 0;
-}
-
-int bch_journal_res_get_slowpath(struct journal *, struct journal_res *,
- unsigned, unsigned);
-
-static inline int journal_res_get_fast(struct journal *j,
- struct journal_res *res,
- unsigned u64s_min,
- unsigned u64s_max)
-{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
-
- /*
- * Check if there is still room in the current journal
- * entry:
- */
- if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
- return 0;
-
- res->offset = old.cur_entry_offset;
- res->u64s = min(u64s_max, j->cur_entry_u64s -
- old.cur_entry_offset);
-
- journal_state_inc(&new);
- new.cur_entry_offset += res->u64s;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- res->ref = true;
- res->idx = new.idx;
- res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
- return 1;
-}
-
-static inline int bch_journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s_min, unsigned u64s_max)
-{
- int ret;
-
- EBUG_ON(res->ref);
- EBUG_ON(u64s_max < u64s_min);
-
- if (journal_res_get_fast(j, res, u64s_min, u64s_max))
- goto out;
-
- ret = bch_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
- if (ret)
- return ret;
-out:
- lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
- EBUG_ON(!res->ref);
- return 0;
-}
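The reservation API is easiest to follow end to end; the following usage sketch is not from the original source (the function name and btree id are placeholders, error handling is minimal): reserve space for one key, add it, then release the reservation.

/* Illustrative sketch: reserve space for one bkey, add it, release. */
static inline int example_journal_add_one_key(struct journal *j,
					      enum btree_id id,
					      const struct bkey_i *k)
{
	struct journal_res res = { 0 };
	unsigned u64s = jset_u64s(k->k.u64s);
	int ret;

	ret = bch_journal_res_get(j, &res, u64s, u64s);
	if (ret)
		return ret;

	bch_journal_add_keys(j, &res, id, k);
	bch_journal_res_put(j, &res);
	return 0;
}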
-
-void bch_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch_journal_flush_seq_async(struct journal *, u64, struct closure *);
-void bch_journal_flush_async(struct journal *, struct closure *);
-void bch_journal_meta_async(struct journal *, struct closure *);
-
-int bch_journal_flush_seq(struct journal *, u64);
-int bch_journal_flush(struct journal *);
-int bch_journal_meta(struct journal *);
-
-void bch_journal_halt(struct journal *);
-
-static inline int bch_journal_error(struct journal *j)
-{
- return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
- ? -EIO : 0;
-}
-
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
- return true;
-}
-
-void bch_journal_start(struct bch_fs *);
-void bch_journal_mark(struct bch_fs *, struct list_head *);
-void bch_journal_entries_free(struct list_head *);
-int bch_journal_read(struct bch_fs *, struct list_head *);
-int bch_journal_replay(struct bch_fs *, struct list_head *);
-
-static inline void bch_journal_set_replay_done(struct journal *j)
-{
- spin_lock(&j->lock);
- BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
-
- set_bit(JOURNAL_REPLAY_DONE, &j->flags);
- j->cur_pin_list = &fifo_peek_back(&j->pin);
- spin_unlock(&j->lock);
-}
-
-ssize_t bch_journal_print_debug(struct journal *, char *);
-
-int bch_dev_journal_alloc(struct bch_dev *);
-
-static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
-
-int bch_journal_move(struct bch_dev *);
-
-void bch_fs_journal_stop(struct journal *);
-void bch_dev_journal_exit(struct bch_dev *);
-int bch_dev_journal_init(struct bch_dev *, struct bch_sb *);
-void bch_fs_journal_exit(struct journal *);
-int bch_fs_journal_init(struct journal *, unsigned);
-
-#endif /* _BCACHE_JOURNAL_H */
diff --git a/libbcache/journal_types.h b/libbcache/journal_types.h
deleted file mode 100644
index ebc340ad..00000000
--- a/libbcache/journal_types.h
+++ /dev/null
@@ -1,242 +0,0 @@
-#ifndef _BCACHE_JOURNAL_TYPES_H
-#define _BCACHE_JOURNAL_TYPES_H
-
-#include <linux/cache.h>
-#include <linux/workqueue.h>
-
-#include "alloc_types.h"
-#include "fifo.h"
-
-struct journal_res;
-
-/*
- * We put two of these in struct journal; we use them for writes to the
- * journal that are being staged or in flight.
- */
-struct journal_buf {
- struct jset *data;
- struct closure_waitlist wait;
-
- /*
- * ugh, prio_buckets are stupid - need to convert them to new
- * transaction machinery when it arrives
- */
- unsigned nr_prio_buckets;
-
- /* bloom filter: */
- unsigned long has_inode[1024 / sizeof(unsigned long)];
-};
-
-/*
- * Something that makes a journal entry dirty - i.e. a btree node that has to be
- * flushed:
- */
-
-struct journal_entry_pin_list {
- struct list_head list;
- atomic_t count;
-};
-
-struct journal;
-struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *);
-
-struct journal_entry_pin {
- struct list_head list;
- journal_pin_flush_fn flush;
- struct journal_entry_pin_list *pin_list;
-};
-
-/* corresponds to a btree node with a blacklisted bset: */
-struct blacklisted_node {
- __le64 seq;
- enum btree_id btree_id;
- struct bpos pos;
-};
-
-struct journal_seq_blacklist {
- struct list_head list;
- u64 seq;
- bool written;
- struct journal_entry_pin pin;
-
- struct blacklisted_node *entries;
- size_t nr_entries;
-};
-
-struct journal_res {
- bool ref;
- u8 idx;
- u16 u64s;
- u32 offset;
- u64 seq;
-};
-
-union journal_res_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 cur_entry_offset:20,
- idx:1,
- prev_buf_unwritten:1,
- buf0_count:21,
- buf1_count:21;
- };
-};
-
-/* 4 mb, in bytes: */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 20)
-
-/*
- * We stash some journal state as sentinel values in cur_entry_offset:
- */
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
-
-#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
-#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
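A hedged sketch of how these sentinels are read (the helper name is hypothetical): any cur_entry_offset below JOURNAL_ENTRY_CLOSED_VAL is a real offset into an open entry, while the two values above it encode "closed" and "error".

/* Sketch (assumption): offsets below CLOSED_VAL mean the entry is open. */
static inline bool example_journal_entry_is_open(union journal_res_state s)
{
	return s.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}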
-
-/*
- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
- * either because something's waiting on the write to complete or because it's
- * been dirty too long and the timer's expired.
- */
-
-enum {
- JOURNAL_REPLAY_DONE,
- JOURNAL_STARTED,
- JOURNAL_NEED_WRITE,
-};
-
-/* Embedded in struct bch_fs */
-struct journal {
- /* Fastpath stuff up front: */
-
- unsigned long flags;
-
- union journal_res_state reservations;
- unsigned cur_entry_u64s;
- unsigned prev_buf_sectors;
- unsigned cur_buf_sectors;
- unsigned entry_size_max; /* bytes */
-
- /*
- * Two journal entries -- one is currently open for new entries, the
- * other is possibly being written out.
- */
- struct journal_buf buf[2];
-
- spinlock_t lock;
-
- /* Used when waiting because the journal was full */
- wait_queue_head_t wait;
-
- struct closure io;
- struct delayed_work write_work;
-
- /* Sequence number of most recent journal entry (last entry in @pin) */
- atomic64_t seq;
-
- /* last_seq from the most recent journal entry written */
- u64 last_seq_ondisk;
-
- /*
- * FIFO of journal entries whose btree updates have not yet been
- * written out.
- *
- * Each entry is a reference count. The position in the FIFO is the
- * entry's sequence number relative to @seq.
- *
- * The journal entry itself holds a reference count, put when the
- * journal entry is written out. Each btree node modified by the journal
- * entry also holds a reference count, put when the btree node is
- * written.
- *
- * When a reference count reaches zero, the journal entry is no longer
- * needed. When all journal entries in the oldest journal bucket are no
- * longer needed, the bucket can be discarded and reused.
- */
- DECLARE_FIFO(struct journal_entry_pin_list, pin);
- struct journal_entry_pin_list *cur_pin_list;
-
- /*
- * Protects the pin lists - the fifo itself is still protected by
- * j->lock though:
- */
- spinlock_t pin_lock;
-
- struct mutex blacklist_lock;
- struct list_head seq_blacklist;
-
- BKEY_PADDED(key);
- struct dev_group devs;
-
- struct delayed_work reclaim_work;
- unsigned long last_flushed;
-
- /* protects advancing ja->last_idx: */
- struct mutex reclaim_lock;
-
- /*
- * ugh: need to get prio_buckets converted over to the eventual new
- * transaction machinery
- */
- __le64 prio_buckets[BCH_SB_MEMBERS_MAX];
- unsigned nr_prio_buckets;
-
- unsigned write_delay_ms;
- unsigned reclaim_delay_ms;
-
- u64 res_get_blocked_start;
- u64 need_write_time;
- u64 write_start_time;
-
- struct time_stats *write_time;
- struct time_stats *delay_time;
- struct time_stats *blocked_time;
- struct time_stats *flush_seq_time;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map res_map;
-#endif
-};
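As a hedged illustration of the @seq/@pin relationship documented above (the helper name and exact formula are assumptions drawn from that comment, not copied from journal.c): the oldest still-dirty sequence number follows from the newest one and the number of entries still pinned.

/* Sketch (assumption): oldest dirty seq = newest seq - pinned entries + 1 */
static inline u64 example_journal_last_seq(struct journal *j)
{
	return (u64) atomic64_read(&j->seq) - fifo_used(&j->pin) + 1;
}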
-
-/*
- * Embedded in struct bch_dev. First three fields refer to the array of journal
- * buckets, in bch_sb.
- */
-struct journal_device {
- /*
- * For each journal bucket, contains the max sequence number of the
- * journal writes it contains - so we know when a bucket can be reused.
- */
- u64 *bucket_seq;
-
- unsigned sectors_free;
-
- /* Journal bucket we're currently writing to */
- unsigned cur_idx;
-
- /*
- * Last journal bucket that still contains an open journal entry.
- *
- * j->lock and j->reclaim_lock must both be held to modify, j->lock
- * sufficient to read:
- */
- unsigned last_idx;
- unsigned nr;
- u64 *buckets;
-
- /* Bio for journal reads/writes to this device */
- struct bio *bio;
-
- /* for bch_journal_read_device */
- struct closure read;
-};
-
-#endif /* _BCACHE_JOURNAL_TYPES_H */
diff --git a/libbcache/keybuf.c b/libbcache/keybuf.c
deleted file mode 100644
index 961fc79a..00000000
--- a/libbcache/keybuf.c
+++ /dev/null
@@ -1,195 +0,0 @@
-
-#include "bcache.h"
-#include "btree_gc.h"
-#include "btree_iter.h"
-#include "keybuf.h"
-
-#include <trace/events/bcache.h>
-
-/*
- * For buffered iteration over the btree, with predicates and ratelimiting and
- * whatnot
- */
-
-static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
-{
- /* Overlapping keys compare equal */
- if (bkey_cmp(l->key.k.p, bkey_start_pos(&r->key.k)) <= 0)
- return -1;
- if (bkey_cmp(bkey_start_pos(&l->key.k), r->key.k.p) >= 0)
- return 1;
- return 0;
-}
-
-static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
- struct keybuf_key *r)
-{
- return clamp_t(s64, bkey_cmp(l->key.k.p, r->key.k.p), -1, 1);
-}
-
-void bch_refill_keybuf(struct bch_fs *c, struct keybuf *buf,
- struct bpos end, keybuf_pred_fn *pred)
-{
- struct bpos start = buf->last_scanned;
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned nr_found = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, buf->last_scanned, k) {
- if (bkey_cmp(k.k->p, end) >= 0) {
- buf->last_scanned = k.k->p;
- goto done;
- }
-
- if (pred(buf, k)) {
- struct keybuf_key *w;
-
- spin_lock(&buf->lock);
-
- w = array_alloc(&buf->freelist);
- if (!w) {
- spin_unlock(&buf->lock);
- goto done;
- }
-
- bkey_reassemble(&w->key, k);
- atomic_set(&w->ref, -1); /* -1 means hasn't started */
-
- if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
- array_free(&buf->freelist, w);
- else
- nr_found++;
-
- spin_unlock(&buf->lock);
- }
-
- buf->last_scanned = k.k->p;
- bch_btree_iter_cond_resched(&iter);
- }
-
- /*
- * If we end up here, it means:
- * - the scan didn't fill up the keybuf
- * - the scan didn't reach the end key
- * - there were no more keys to iterate over
- * Therefore, we are at the end of the key space
- */
- buf->last_scanned = POS_MAX;
-done:
- bch_btree_iter_unlock(&iter);
-
- trace_bcache_keyscan(nr_found,
- start.inode, start.offset,
- buf->last_scanned.inode,
- buf->last_scanned.offset);
-
- spin_lock(&buf->lock);
-
- if (!RB_EMPTY_ROOT(&buf->keys)) {
- struct keybuf_key *w;
-
- w = RB_FIRST(&buf->keys, struct keybuf_key, node);
- buf->start = bkey_start_pos(&w->key.k);
-
- w = RB_LAST(&buf->keys, struct keybuf_key, node);
- buf->end = w->key.k.p;
- } else {
- buf->start = POS_MAX;
- buf->end = POS_MAX;
- }
-
- spin_unlock(&buf->lock);
-}
-
-static void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
-{
- rb_erase(&w->node, &buf->keys);
- array_free(&buf->freelist, w);
-}
-
-void bch_keybuf_put(struct keybuf *buf, struct keybuf_key *w)
-{
- BUG_ON(atomic_read(&w->ref) <= 0);
-
- if (atomic_dec_and_test(&w->ref)) {
- up(&buf->in_flight);
-
- spin_lock(&buf->lock);
- bch_keybuf_del(buf, w);
- spin_unlock(&buf->lock);
- }
-}
-
-void bch_keybuf_recalc_oldest_gens(struct bch_fs *c, struct keybuf *buf)
-{
- struct keybuf_key *w, *n;
-
- spin_lock(&buf->lock);
- rbtree_postorder_for_each_entry_safe(w, n,
- &buf->keys, node)
- bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(&w->key));
- spin_unlock(&buf->lock);
-}
-
-bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bpos start,
- struct bpos end)
-{
- bool ret = false;
- struct keybuf_key *w, *next, s = { .key.k.p = start };
-
- if (bkey_cmp(end, buf->start) <= 0 ||
- bkey_cmp(start, buf->end) >= 0)
- return false;
-
- spin_lock(&buf->lock);
-
- for (w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
- w && bkey_cmp(bkey_start_pos(&w->key.k), end) < 0;
- w = next) {
- next = RB_NEXT(w, node);
-
- if (atomic_read(&w->ref) == -1)
- bch_keybuf_del(buf, w);
- else
- ret = true;
- }
-
- spin_unlock(&buf->lock);
- return ret;
-}
-
-struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
-{
- struct keybuf_key *w;
-
- spin_lock(&buf->lock);
-
- w = RB_FIRST(&buf->keys, struct keybuf_key, node);
-
- while (w && atomic_read(&w->ref) != -1)
- w = RB_NEXT(w, node);
-
- if (!w) {
- spin_unlock(&buf->lock);
- return NULL;
- }
-
- atomic_set(&w->ref, 1);
- spin_unlock(&buf->lock);
-
- down(&buf->in_flight);
-
- return w;
-}
-
-void bch_keybuf_init(struct keybuf *buf)
-{
- sema_init(&buf->in_flight, KEYBUF_REFILL_BATCH / 2);
-
- buf->last_scanned = POS_MAX;
- buf->start = POS_MIN;
- buf->end = POS_MIN;
-
- buf->keys = RB_ROOT;
-
- spin_lock_init(&buf->lock);
- array_allocator_init(&buf->freelist);
-}
diff --git a/libbcache/keybuf.h b/libbcache/keybuf.h
deleted file mode 100644
index dd1402d3..00000000
--- a/libbcache/keybuf.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _BCACHE_KEYBUF_H
-#define _BCACHE_KEYBUF_H
-
-#include "keybuf_types.h"
-
-typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey_s_c);
-
-void bch_keybuf_init(struct keybuf *);
-void bch_refill_keybuf(struct bch_fs *, struct keybuf *,
- struct bpos, keybuf_pred_fn *);
-void bch_keybuf_recalc_oldest_gens(struct bch_fs *, struct keybuf *);
-bool bch_keybuf_check_overlapping(struct keybuf *, struct bpos, struct bpos);
-void bch_keybuf_put(struct keybuf *, struct keybuf_key *);
-struct keybuf_key *bch_keybuf_next(struct keybuf *);
-
-#endif /* _BCACHE_KEYBUF_H */
diff --git a/libbcache/keybuf_types.h b/libbcache/keybuf_types.h
deleted file mode 100644
index 3facc4a0..00000000
--- a/libbcache/keybuf_types.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _BCACHE_KEYBUF_TYPES_H
-#define _BCACHE_KEYBUF_TYPES_H
-
-struct keybuf_key {
- struct rb_node node;
- BKEY_PADDED(key);
- atomic_t ref;
-};
-
-#define KEYBUF_REFILL_BATCH 500
-
-struct keybuf {
- struct bpos last_scanned;
- spinlock_t lock;
-
- /*
- * Beginning and end of range in rb tree - so that we can skip taking
- * lock and checking the rb tree when we need to check for overlapping
- * keys.
- */
- struct bpos start;
- struct bpos end;
-
- struct rb_root keys;
-
- unsigned max_in_flight;
- struct semaphore in_flight;
-
- DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist,
- KEYBUF_REFILL_BATCH);
-};
-
-#endif /* _BCACHE_KEYBUF_TYPES_H */
diff --git a/libbcache/keylist.c b/libbcache/keylist.c
deleted file mode 100644
index adf5eeba..00000000
--- a/libbcache/keylist.c
+++ /dev/null
@@ -1,55 +0,0 @@
-
-#include "bcache.h"
-#include "keylist.h"
-
-int bch_keylist_realloc(struct keylist *l, u64 *inline_u64s,
- size_t nr_inline_u64s, size_t new_u64s)
-{
- size_t oldsize = bch_keylist_u64s(l);
- size_t newsize = oldsize + new_u64s;
- u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
- u64 *new_keys;
-
- newsize = roundup_pow_of_two(newsize);
-
- if (newsize <= nr_inline_u64s ||
- (old_buf && roundup_pow_of_two(oldsize) == newsize))
- return 0;
-
- new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
- if (!new_keys)
- return -ENOMEM;
-
- if (!old_buf)
- memcpy_u64s(new_keys, inline_u64s, oldsize);
-
- l->keys_p = new_keys;
- l->top_p = new_keys + oldsize;
-
- return 0;
-}
-
-void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
-{
- struct bkey_i *where;
-
- for_each_keylist_key(l, where)
- if (bkey_cmp(insert->k.p, where->k.p) < 0)
- break;
-
- memmove_u64s_up((u64 *) where + insert->k.u64s,
- where,
- ((u64 *) l->top) - ((u64 *) where));
-
- l->top_p += insert->k.u64s;
- bkey_copy(where, insert);
-}
-
-void bch_keylist_pop_front(struct keylist *l)
-{
- l->top_p -= bch_keylist_front(l)->k.u64s;
-
- memmove_u64s_down(l->keys,
- bkey_next(l->keys),
- bch_keylist_u64s(l));
-}
diff --git a/libbcache/keylist.h b/libbcache/keylist.h
deleted file mode 100644
index 1166f941..00000000
--- a/libbcache/keylist.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef _BCACHE_KEYLIST_H
-#define _BCACHE_KEYLIST_H
-
-#include "keylist_types.h"
-
-int bch_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch_keylist_add_in_order(struct keylist *, struct bkey_i *);
-void bch_keylist_pop_front(struct keylist *);
-
-static inline void bch_keylist_init(struct keylist *l, u64 *inline_keys,
- size_t nr_inline_u64s)
-{
- l->top_p = l->keys_p = inline_keys;
-}
-
-static inline void bch_keylist_free(struct keylist *l, u64 *inline_keys)
-{
- if (l->keys_p != inline_keys)
- kfree(l->keys_p);
- memset(l, 0, sizeof(*l));
-}
-
-static inline void bch_keylist_push(struct keylist *l)
-{
- l->top = bkey_next(l->top);
-}
-
-static inline void bch_keylist_add(struct keylist *l, const struct bkey_i *k)
-{
- bkey_copy(l->top, k);
- bch_keylist_push(l);
-}
-
-static inline bool bch_keylist_empty(struct keylist *l)
-{
- return l->top == l->keys;
-}
-
-static inline size_t bch_keylist_u64s(struct keylist *l)
-{
- return l->top_p - l->keys_p;
-}
-
-static inline size_t bch_keylist_bytes(struct keylist *l)
-{
- return bch_keylist_u64s(l) * sizeof(u64);
-}
-
-static inline struct bkey_i *bch_keylist_front(struct keylist *l)
-{
- return l->keys;
-}
-
-#define for_each_keylist_key(_keylist, _k) \
- for (_k = (_keylist)->keys; \
- _k != (_keylist)->top; \
- _k = bkey_next(_k))
-
-#define keylist_single(k) \
- ((struct keylist) { .keys = k, .top = bkey_next(k) })
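A hedged usage sketch of the keylist API (the function name and inline buffer size are arbitrary): a keylist typically starts on a small stack buffer and only needs bch_keylist_realloc() when it outgrows it.

/* Sketch (assumption): build a keylist on a stack buffer and add one key. */
static inline void example_keylist_use(struct bkey_i *k)
{
	u64 inline_keys[32];	/* arbitrary inline capacity for the sketch */
	struct keylist keys;

	bch_keylist_init(&keys, inline_keys, ARRAY_SIZE(inline_keys));
	bch_keylist_add(&keys, k);	/* copies *k onto the list */
	/* iterate with for_each_keylist_key(&keys, i) as needed */
	bch_keylist_free(&keys, inline_keys);
}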
-
-#endif /* _BCACHE_KEYLIST_H */
diff --git a/libbcache/keylist_types.h b/libbcache/keylist_types.h
deleted file mode 100644
index 195785bf..00000000
--- a/libbcache/keylist_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _BCACHE_KEYLIST_TYPES_H
-#define _BCACHE_KEYLIST_TYPES_H
-
-struct keylist {
- union {
- struct bkey_i *keys;
- u64 *keys_p;
- };
- union {
- struct bkey_i *top;
- u64 *top_p;
- };
-};
-
-#endif /* _BCACHE_KEYLIST_TYPES_H */
diff --git a/libbcache/migrate.c b/libbcache/migrate.c
deleted file mode 100644
index 9ef9685e..00000000
--- a/libbcache/migrate.c
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * Code for moving data off a device.
- */
-
-#include "bcache.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "keylist.h"
-#include "migrate.h"
-#include "move.h"
-#include "super-io.h"
-
-static int issue_migration_move(struct bch_dev *ca,
- struct moving_context *ctxt,
- struct bkey_s_c k)
-{
- struct bch_fs *c = ca->fs;
- struct disk_reservation res;
- const struct bch_extent_ptr *ptr;
- int ret;
-
- if (bch_disk_reservation_get(c, &res, k.k->size, 0))
- return -ENOSPC;
-
- extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
- if (ptr->dev == ca->dev_idx)
- goto found;
-
- BUG();
-found:
- /* XXX: we need to be doing something with the disk reservation */
-
- ret = bch_data_move(c, ctxt, &c->migration_write_point, k, ptr);
- if (ret)
- bch_disk_reservation_put(c, &res);
- return ret;
-}
-
-#define MAX_DATA_OFF_ITER 10
-
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in a device that already has a
- * copy (if there are enough devices).
- * This is _not_ currently implemented. The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch_move_data_off_device(struct bch_dev *ca)
-{
- struct moving_context ctxt;
- struct bch_fs *c = ca->fs;
- struct bch_sb_field_members *mi;
- unsigned pass = 0;
- u64 seen_key_count;
- int ret = 0;
-
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
-
- if (!ca->mi.has_data)
- return 0;
-
- bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
- ctxt.avoid = ca;
-
- /*
- * In theory, only one pass should be necessary as we've
- * quiesced all writes before calling this.
- *
- * However, in practice, more than one pass may be necessary:
- * - Some move fails due to an error. We can find this out
- * from the moving_context.
- * - Some key swap failed because some of the pointers in the
- * key in the tree changed due to caching behavior, btree gc
- * pruning stale pointers, or tiering (if the device being
- * removed is in tier 0). A smarter bkey_cmpxchg would
- * handle these cases.
- *
- * Thus this scans the tree one more time than strictly necessary,
- * but that can be viewed as a verification pass.
- */
-
- do {
- struct btree_iter iter;
- struct bkey_s_c k;
-
- seen_key_count = 0;
- atomic_set(&ctxt.error_count, 0);
- atomic_set(&ctxt.error_flags, 0);
-
- bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
-
- while (!bch_move_ctxt_wait(&ctxt) &&
- (k = bch_btree_iter_peek(&iter)).k &&
- !(ret = btree_iter_err(k))) {
- if (!bkey_extent_is_data(k.k) ||
- !bch_extent_has_device(bkey_s_c_to_extent(k),
- ca->dev_idx))
- goto next;
-
- ret = issue_migration_move(ca, &ctxt, k);
- if (ret == -ENOMEM) {
- bch_btree_iter_unlock(&iter);
-
- /*
- * memory allocation failure, wait for some IO
- * to finish
- */
- bch_move_ctxt_wait_for_io(&ctxt);
- continue;
- }
- if (ret == -ENOSPC)
- break;
- BUG_ON(ret);
-
- seen_key_count++;
-next:
- bch_btree_iter_advance_pos(&iter);
- bch_btree_iter_cond_resched(&iter);
- }
- bch_btree_iter_unlock(&iter);
- bch_move_ctxt_exit(&ctxt);
-
- if (ret)
- return ret;
- } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER);
-
- if (seen_key_count) {
- pr_err("Unable to migrate all data in %d iterations.",
- MAX_DATA_OFF_ITER);
- return -1;
- }
-
- mutex_lock(&c->sb_lock);
- mi = bch_sb_get_members(c->disk_sb);
- SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
-
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
-static int bch_move_btree_off(struct bch_dev *ca, enum btree_id id)
-{
- struct bch_fs *c = ca->fs;
- struct btree_iter iter;
- struct closure cl;
- struct btree *b;
- int ret;
-
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
-
- closure_init_stack(&cl);
-
- for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-retry:
- if (!bch_extent_has_device(e, ca->dev_idx))
- continue;
-
- ret = bch_btree_node_rewrite(&iter, b, &cl);
- if (ret == -EINTR || ret == -ENOSPC) {
- /*
- * Drop locks to upgrade locks or wait on
- * reserve: after retaking, recheck in case we
- * raced.
- */
- bch_btree_iter_unlock(&iter);
- closure_sync(&cl);
- b = bch_btree_iter_peek_node(&iter);
- goto retry;
- }
- if (ret) {
- bch_btree_iter_unlock(&iter);
- return ret;
- }
-
- bch_btree_iter_set_locks_want(&iter, 0);
- }
- ret = bch_btree_iter_unlock(&iter);
- if (ret)
- return ret; /* btree IO error */
-
- if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
- for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
- BUG_ON(bch_extent_has_device(e, ca->dev_idx));
- }
- bch_btree_iter_unlock(&iter);
- }
-
- return 0;
-}
-
-/*
- * This moves only the meta-data off, leaving the data (if any) in place.
- * The data is moved off by bch_move_data_off_device, if desired, and
- * called first.
- *
- * Before calling this, allocation of buckets to the device must have
- * been disabled, as otherwise we'll continue to write meta-data to the device
- * when new buckets are picked for meta-data writes.
- * In addition, the copying gc and allocator threads for the device
- * must have been stopped. The allocator thread is the only thread
- * that writes prio/gen information.
- *
- * Meta-data consists of:
- * - Btree nodes
- * - Prio/gen information
- * - Journal entries
- * - Superblock
- *
- * This has to move the btree nodes and the journal only:
- * - prio/gen information is not written once the allocator thread is stopped.
- * also, as the prio/gen information is per-device it is not moved.
- * - the superblock will be written by the caller once after everything
- * is stopped.
- *
- * Note that currently there is no way to stop btree node and journal
- * meta-data writes to a device without moving the meta-data because
- * once a bucket is open for a btree node, unless a replacement btree
- * node is allocated (and the tree updated), the bucket will continue
- * to be written with updates. Similarly for the journal (it gets
- * written until filled).
- *
- * This routine leaves the data (if any) in place. Whether the data
- * should be moved off is a decision independent of whether the meta
- * data should be moved off and stopped:
- *
- * - For device removal, both data and meta-data are moved off, in
- * that order.
- *
- * - However, for turning a device read-only without removing it, only
- * meta-data is moved off since that's the only way to prevent it
- * from being written. Data is left in the device, but no new data
- * is written.
- */
-
-int bch_move_metadata_off_device(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bch_sb_field_members *mi;
- unsigned i;
- int ret;
-
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW);
-
- if (!ca->mi.has_metadata)
- return 0;
-
- /* 1st, Move the btree nodes off the device */
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- ret = bch_move_btree_off(ca, i);
- if (ret)
- return ret;
- }
-
- /* There are no prios/gens to move -- they are already in the device. */
-
- /* 2nd. Move the journal off the device */
-
- ret = bch_journal_move(ca);
- if (ret)
- return ret;
-
- mutex_lock(&c->sb_lock);
- mi = bch_sb_get_members(c->disk_sb);
- SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
-
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
-
-static int bch_flag_key_bad(struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c_extent orig)
-{
- BKEY_PADDED(key) tmp;
- struct bkey_s_extent e;
- struct bch_extent_ptr *ptr;
- struct bch_fs *c = ca->fs;
-
- bkey_reassemble(&tmp.key, orig.s_c);
- e = bkey_i_to_s_extent(&tmp.key);
-
- extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == ca->dev_idx)
- bch_extent_drop_ptr(e, ptr);
-
- /*
- * If the new extent no longer has any pointers, bch_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
- */
- bch_extent_normalize(c, e.s);
-
- return bch_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &tmp.key));
-}
-
-/*
- * This doesn't actually move any data -- it marks the keys as bad
- * if they contain a pointer to a device that is forcibly removed
- * and don't have other valid pointers. If there are valid pointers,
- * the necessary pointers to the removed device are replaced with
- * bad pointers instead.
- *
- * This is only called if bch_move_data_off_device above failed, meaning
- * that we've already tried to move the data MAX_DATA_OFF_ITER times and
- * are not likely to succeed if we try again.
- */
-int bch_flag_data_bad(struct bch_dev *ca)
-{
- int ret = 0;
- struct bkey_s_c k;
- struct bkey_s_c_extent e;
- struct btree_iter iter;
-
- bch_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS, POS_MIN);
-
- while ((k = bch_btree_iter_peek(&iter)).k &&
- !(ret = btree_iter_err(k))) {
- if (!bkey_extent_is_data(k.k))
- goto advance;
-
- e = bkey_s_c_to_extent(k);
- if (!bch_extent_has_device(e, ca->dev_idx))
- goto advance;
-
- ret = bch_flag_key_bad(&iter, ca, e);
-
- /*
- * don't want to leave ret == -EINTR, since if we raced and
- * something else overwrote the key we could spuriously return
- * -EINTR below:
- */
- if (ret == -EINTR)
- ret = 0;
- if (ret)
- break;
-
- /*
- * If the replica we're dropping was dirty and there is an
- * additional cached replica, the cached replica will now be
- * considered dirty - upon inserting the new version of the key,
- * the bucket accounting will be updated to reflect the fact
- * that the cached data is now dirty and everything works out as
- * if by magic without us having to do anything.
- *
- * The one thing we need to be concerned with here is there's a
- * race between when we drop any stale pointers from the key
- * we're about to insert, and when the key actually gets
- * inserted and the cached data is marked as dirty - we could
- * end up trying to insert a key with a pointer that should be
- * dirty, but points to stale data.
- *
- * If that happens the insert code just bails out and doesn't do
- * the insert - however, it doesn't return an error. Hence we
- * need to always recheck the current key before advancing to
- * the next:
- */
- continue;
-advance:
- bch_btree_iter_advance_pos(&iter);
- }
-
- bch_btree_iter_unlock(&iter);
-
- return ret;
-}
diff --git a/libbcache/migrate.h b/libbcache/migrate.h
deleted file mode 100644
index c6a056cb..00000000
--- a/libbcache/migrate.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _BCACHE_MIGRATE_H
-#define _BCACHE_MIGRATE_H
-
-int bch_move_data_off_device(struct bch_dev *);
-int bch_move_metadata_off_device(struct bch_dev *);
-int bch_flag_data_bad(struct bch_dev *);
-
-#endif /* _BCACHE_MIGRATE_H */
diff --git a/libbcache/move.c b/libbcache/move.c
deleted file mode 100644
index edee726c..00000000
--- a/libbcache/move.c
+++ /dev/null
@@ -1,392 +0,0 @@
-
-#include "bcache.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "io.h"
-#include "move.h"
-#include "super-io.h"
-#include "keylist.h"
-
-#include <linux/ioprio.h>
-
-#include <trace/events/bcache.h>
-
-static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
- struct bkey_s_extent e,
- struct bch_extent_ptr ptr)
-{
- struct bch_extent_ptr *ptr2;
- unsigned bucket_bits = c->devs[ptr.dev]->bucket_bits;
-
- extent_for_each_ptr(e, ptr2)
- if (ptr2->dev == ptr.dev &&
- ptr2->gen == ptr.gen &&
- (ptr2->offset >> bucket_bits) ==
- (ptr.offset >> bucket_bits))
- return ptr2;
-
- return NULL;
-}
-
-static struct bch_extent_ptr *bch_migrate_matching_ptr(struct migrate_write *m,
- struct bkey_s_extent e)
-{
- const struct bch_extent_ptr *ptr;
- struct bch_extent_ptr *ret;
-
- if (m->move)
- ret = bkey_find_ptr(m->op.c, e, m->move_ptr);
- else
- extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr)
- if ((ret = bkey_find_ptr(m->op.c, e, *ptr)))
- break;
-
- return ret;
-}
-
-static int bch_migrate_index_update(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct migrate_write *m =
- container_of(op, struct migrate_write, op);
- struct keylist *keys = &op->insert_keys;
- struct btree_iter iter;
- int ret = 0;
-
- bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch_keylist_front(keys)->k));
-
- while (1) {
- struct bkey_s_extent insert =
- bkey_i_to_s_extent(bch_keylist_front(keys));
- struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
- struct bch_extent_ptr *ptr;
- struct bkey_s_extent e;
- BKEY_PADDED(k) new;
-
- if (!k.k) {
- ret = bch_btree_iter_unlock(&iter);
- break;
- }
-
- if (!bkey_extent_is_data(k.k))
- goto nomatch;
-
- bkey_reassemble(&new.k, k);
- bch_cut_front(iter.pos, &new.k);
- bch_cut_back(insert.k->p, &new.k.k);
- e = bkey_i_to_s_extent(&new.k);
-
- /* hack - promotes can race: */
- if (m->promote)
- extent_for_each_ptr(insert, ptr)
- if (bch_extent_has_device(e.c, ptr->dev))
- goto nomatch;
-
- ptr = bch_migrate_matching_ptr(m, e);
- if (ptr) {
- int nr_new_dirty = bch_extent_nr_dirty_ptrs(insert.s_c);
- unsigned insert_flags =
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL;
-
- /* copygc uses btree node reserve: */
- if (m->move)
- insert_flags |= BTREE_INSERT_USE_RESERVE;
-
- if (m->move) {
- nr_new_dirty -= !ptr->cached;
- __bch_extent_drop_ptr(e, ptr);
- }
-
- BUG_ON(nr_new_dirty < 0);
-
- memcpy_u64s(extent_entry_last(e),
- insert.v,
- bkey_val_u64s(insert.k));
- e.k->u64s += bkey_val_u64s(insert.k);
-
- bch_extent_narrow_crcs(e);
- bch_extent_drop_redundant_crcs(e);
- bch_extent_normalize(c, e.s);
- bch_extent_mark_replicas_cached(c, e, nr_new_dirty);
-
- ret = bch_btree_insert_at(c, &op->res,
- NULL, op_journal_seq(op),
- insert_flags,
- BTREE_INSERT_ENTRY(&iter, &new.k));
- if (ret && ret != -EINTR)
- break;
- } else {
-nomatch:
- bch_btree_iter_advance_pos(&iter);
- }
-
- while (bkey_cmp(iter.pos, bch_keylist_front(keys)->k.p) >= 0) {
- bch_keylist_pop_front(keys);
- if (bch_keylist_empty(keys))
- goto out;
- }
-
- bch_cut_front(iter.pos, bch_keylist_front(keys));
- }
-out:
- bch_btree_iter_unlock(&iter);
- return ret;
-}
-
-void bch_migrate_write_init(struct bch_fs *c,
- struct migrate_write *m,
- struct write_point *wp,
- struct bkey_s_c k,
- const struct bch_extent_ptr *move_ptr,
- unsigned flags)
-{
- bkey_reassemble(&m->key, k);
-
- m->promote = false;
- m->move = move_ptr != NULL;
- if (move_ptr)
- m->move_ptr = *move_ptr;
-
- if (bkey_extent_is_cached(k.k) ||
- (move_ptr && move_ptr->cached))
- flags |= BCH_WRITE_CACHED;
-
- bch_write_op_init(&m->op, c, &m->wbio,
- (struct disk_reservation) { 0 },
- wp,
- bkey_start_pos(k.k),
- NULL, flags);
-
- if (m->move)
- m->op.alloc_reserve = RESERVE_MOVINGGC;
-
- m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k));
- m->op.nr_replicas = 1;
- m->op.index_update_fn = bch_migrate_index_update;
-}
-
-static void migrate_bio_init(struct moving_io *io, struct bio *bio,
- unsigned sectors)
-{
- bio_init(bio);
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-
- bio->bi_iter.bi_size = sectors << 9;
- bio->bi_max_vecs = DIV_ROUND_UP(sectors, PAGE_SECTORS);
- bio->bi_private = &io->cl;
- bio->bi_io_vec = io->bi_inline_vecs;
- bch_bio_map(bio, NULL);
-}
-
-static void moving_io_destructor(struct closure *cl)
-{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_context *ctxt = io->ctxt;
- struct bio_vec *bv;
- int i;
-
- //if (io->replace.failures)
- // trace_bcache_copy_collision(q, &io->key.k);
-
- atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
- wake_up(&ctxt->wait);
-
- bio_for_each_segment_all(bv, &io->write.wbio.bio, i)
- if (bv->bv_page)
- __free_page(bv->bv_page);
-
- kfree(io);
-}
-
-static void moving_error(struct moving_context *ctxt, unsigned flag)
-{
- atomic_inc(&ctxt->error_count);
- //atomic_or(flag, &ctxt->error_flags);
-}
-
-static void moving_io_after_write(struct closure *cl)
-{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_context *ctxt = io->ctxt;
-
- if (io->write.op.error)
- moving_error(ctxt, MOVING_FLAG_WRITE);
-
- moving_io_destructor(cl);
-}
-
-static void write_moving(struct moving_io *io)
-{
- struct bch_write_op *op = &io->write.op;
-
- if (op->error) {
- closure_return_with_destructor(&io->cl, moving_io_destructor);
- } else {
- closure_call(&op->cl, bch_write, NULL, &io->cl);
- closure_return_with_destructor(&io->cl, moving_io_after_write);
- }
-}
-
-static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
-{
- struct moving_io *io =
- list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
-
- return io && io->read_completed ? io : NULL;
-}
-
-static void read_moving_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_context *ctxt = io->ctxt;
-
- trace_bcache_move_read_done(&io->write.key.k);
-
- if (bio->bi_error) {
- io->write.op.error = bio->bi_error;
- moving_error(io->ctxt, MOVING_FLAG_READ);
- }
-
- io->read_completed = true;
- if (next_pending_write(ctxt))
- wake_up(&ctxt->wait);
-
- closure_put(&ctxt->cl);
-}
-
-static void __bch_data_move(struct closure *cl)
-{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct bch_fs *c = io->write.op.c;
- struct extent_pick_ptr pick;
-
- bch_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key),
- io->ctxt->avoid, &pick);
- if (IS_ERR_OR_NULL(pick.ca))
- closure_return_with_destructor(cl, moving_io_destructor);
-
- bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k);
- io->rbio.bio.bi_end_io = read_moving_endio;
-
- /*
- * dropped by read_moving_endio() - guards against use after free of
- * ctxt when doing wakeup
- */
- closure_get(&io->ctxt->cl);
-
- bch_read_extent(c, &io->rbio,
- bkey_i_to_s_c(&io->write.key),
- &pick, BCH_READ_IS_LAST);
-}
-
-int bch_data_move(struct bch_fs *c,
- struct moving_context *ctxt,
- struct write_point *wp,
- struct bkey_s_c k,
- const struct bch_extent_ptr *move_ptr)
-{
- struct moving_io *io;
-
- io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
- DIV_ROUND_UP(k.k->size, PAGE_SECTORS),
- GFP_KERNEL);
- if (!io)
- return -ENOMEM;
-
- io->ctxt = ctxt;
-
- migrate_bio_init(io, &io->rbio.bio, k.k->size);
-
- if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
- kfree(io);
- return -ENOMEM;
- }
-
- migrate_bio_init(io, &io->write.wbio.bio, k.k->size);
- bio_get(&io->write.wbio.bio);
- io->write.wbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
-
- bch_migrate_write_init(c, &io->write, wp, k, move_ptr, 0);
-
- trace_bcache_move_read(&io->write.key.k);
-
- ctxt->keys_moved++;
- ctxt->sectors_moved += k.k->size;
- if (ctxt->rate)
- bch_ratelimit_increment(ctxt->rate, k.k->size);
-
- atomic_add(k.k->size, &ctxt->sectors_in_flight);
- list_add_tail(&io->list, &ctxt->reads);
-
- closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl);
- return 0;
-}
-
-static void do_pending_writes(struct moving_context *ctxt)
-{
- struct moving_io *io;
-
- while ((io = next_pending_write(ctxt))) {
- list_del(&io->list);
- trace_bcache_move_write(&io->write.key.k);
- write_moving(io);
- }
-}
-
-#define move_ctxt_wait_event(_ctxt, _cond) \
-do { \
- do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- __wait_event((_ctxt)->wait, \
- next_pending_write(_ctxt) || (_cond)); \
-} while (1)
-
-int bch_move_ctxt_wait(struct moving_context *ctxt)
-{
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->sectors_in_flight) <
- ctxt->max_sectors_in_flight);
-
- return ctxt->rate
- ? bch_ratelimit_wait_freezable_stoppable(ctxt->rate)
- : 0;
-}
-
-void bch_move_ctxt_wait_for_io(struct moving_context *ctxt)
-{
- unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
-
- move_ctxt_wait_event(ctxt,
- !atomic_read(&ctxt->sectors_in_flight) ||
- atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
-}
-
-void bch_move_ctxt_exit(struct moving_context *ctxt)
-{
- move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight));
- closure_sync(&ctxt->cl);
-
- EBUG_ON(!list_empty(&ctxt->reads));
- EBUG_ON(atomic_read(&ctxt->sectors_in_flight));
-}
-
-void bch_move_ctxt_init(struct moving_context *ctxt,
- struct bch_ratelimit *rate,
- unsigned max_sectors_in_flight)
-{
- memset(ctxt, 0, sizeof(*ctxt));
- closure_init_stack(&ctxt->cl);
-
- ctxt->rate = rate;
- ctxt->max_sectors_in_flight = max_sectors_in_flight;
-
- INIT_LIST_HEAD(&ctxt->reads);
- init_waitqueue_head(&ctxt->wait);
-}
diff --git a/libbcache/move.h b/libbcache/move.h
deleted file mode 100644
index 317431d6..00000000
--- a/libbcache/move.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef _BCACHE_MOVE_H
-#define _BCACHE_MOVE_H
-
-#include "buckets.h"
-#include "io_types.h"
-#include "move_types.h"
-
-enum moving_flag_bitnos {
- MOVING_FLAG_BITNO_READ = 0,
- MOVING_FLAG_BITNO_WRITE,
-};
-
-#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ)
-#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE)
-
-struct migrate_write {
- BKEY_PADDED(key);
- bool promote;
- bool move;
- struct bch_extent_ptr move_ptr;
- struct bch_write_op op;
- struct bch_write_bio wbio;
-};
-
-void bch_migrate_write_init(struct bch_fs *,
- struct migrate_write *,
- struct write_point *,
- struct bkey_s_c,
- const struct bch_extent_ptr *,
- unsigned);
-
-#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
-
-struct moving_context {
- /* Closure for waiting on all reads and writes to complete */
- struct closure cl;
-
- /* Number and types of errors reported */
- atomic_t error_count;
- atomic_t error_flags;
-
- /* Key and sector moves issued, updated from submission context */
- u64 keys_moved;
- u64 sectors_moved;
-
- /* Rate-limiter counting submitted reads */
- struct bch_ratelimit *rate;
-
- /* Try to avoid reading the following device */
- struct bch_dev *avoid;
-
- struct list_head reads;
-
- /* Configuration */
- unsigned max_sectors_in_flight;
- atomic_t sectors_in_flight;
-
- wait_queue_head_t wait;
-};
-
-struct moving_io {
- struct list_head list;
- struct rb_node node;
- struct closure cl;
- struct moving_context *ctxt;
- struct migrate_write write;
- bool read_completed;
-
- struct bch_read_bio rbio;
- /* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[0];
-};
-
-int bch_data_move(struct bch_fs *,
- struct moving_context *,
- struct write_point *,
- struct bkey_s_c,
- const struct bch_extent_ptr *);
-
-int bch_move_ctxt_wait(struct moving_context *);
-void bch_move_ctxt_wait_for_io(struct moving_context *);
-
-void bch_move_ctxt_exit(struct moving_context *);
-void bch_move_ctxt_init(struct moving_context *, struct bch_ratelimit *,
- unsigned);
-
-#endif /* _BCACHE_MOVE_H */
diff --git a/libbcache/move_types.h b/libbcache/move_types.h
deleted file mode 100644
index 0e2275e2..00000000
--- a/libbcache/move_types.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _BCACHE_MOVE_TYPES_H
-#define _BCACHE_MOVE_TYPES_H
-
-#endif /* _BCACHE_MOVE_TYPES_H */
diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c
deleted file mode 100644
index 9bb2b7a4..00000000
--- a/libbcache/movinggc.c
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Moving/copying garbage collector
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "clock.h"
-#include "extents.h"
-#include "io.h"
-#include "keylist.h"
-#include "move.h"
-#include "movinggc.h"
-
-#include <trace/events/bcache.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/wait.h>
-
-/* Moving GC - IO loop */
-
-static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
- struct bkey_s_c k)
-{
- const struct bch_extent_ptr *ptr;
-
- if (bkey_extent_is_data(k.k) &&
- (ptr = bch_extent_has_device(bkey_s_c_to_extent(k),
- ca->dev_idx)) &&
- PTR_BUCKET(ca, ptr)->mark.copygc)
- return ptr;
-
- return NULL;
-}
-
-static int issue_moving_gc_move(struct bch_dev *ca,
- struct moving_context *ctxt,
- struct bkey_s_c k)
-{
- struct bch_fs *c = ca->fs;
- const struct bch_extent_ptr *ptr;
- int ret;
-
- ptr = moving_pred(ca, k);
- if (!ptr) /* We raced - bucket's been reused */
- return 0;
-
- ret = bch_data_move(c, ctxt, &ca->copygc_write_point, k, ptr);
- if (!ret)
- trace_bcache_gc_copy(k.k);
- else
- trace_bcache_moving_gc_alloc_fail(c, k.k->size);
- return ret;
-}
-
-static void read_moving(struct bch_dev *ca, size_t buckets_to_move,
- u64 sectors_to_move)
-{
- struct bch_fs *c = ca->fs;
- struct bucket *g;
- struct moving_context ctxt;
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 sectors_not_moved = 0;
- size_t buckets_not_moved = 0;
-
- bch_ratelimit_reset(&ca->moving_gc_pd.rate);
- bch_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
- SECTORS_IN_FLIGHT_PER_DEVICE);
- bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
-
- while (1) {
- if (kthread_should_stop())
- goto out;
- if (bch_move_ctxt_wait(&ctxt))
- goto out;
- k = bch_btree_iter_peek(&iter);
- if (!k.k)
- break;
- if (btree_iter_err(k))
- goto out;
-
- if (!moving_pred(ca, k))
- goto next;
-
- if (issue_moving_gc_move(ca, &ctxt, k)) {
- bch_btree_iter_unlock(&iter);
-
- /* memory allocation failure, wait for some IO to finish */
- bch_move_ctxt_wait_for_io(&ctxt);
- continue;
- }
-next:
- bch_btree_iter_advance_pos(&iter);
- //bch_btree_iter_cond_resched(&iter);
-
- /* unlock before calling moving_context_wait() */
- bch_btree_iter_unlock(&iter);
- cond_resched();
- }
-
- bch_btree_iter_unlock(&iter);
- bch_move_ctxt_exit(&ctxt);
- trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
- buckets_to_move);
-
- /* don't check this if we bailed out early: */
- for_each_bucket(g, ca)
- if (g->mark.copygc && bucket_sectors_used(g)) {
- sectors_not_moved += bucket_sectors_used(g);
- buckets_not_moved++;
- }
-
- if (sectors_not_moved)
- bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
- sectors_not_moved, sectors_to_move,
- buckets_not_moved, buckets_to_move);
- return;
-out:
- bch_btree_iter_unlock(&iter);
- bch_move_ctxt_exit(&ctxt);
- trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
- buckets_to_move);
-}
-
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
- bool ret;
-
- spin_lock(&ca->freelist_lock);
- ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
- COPYGC_BUCKETS_PER_ITER(ca);
- spin_unlock(&ca->freelist_lock);
-
- return ret;
-}
-
-static void bch_moving_gc(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bucket *g;
- struct bucket_mark new;
- u64 sectors_to_move;
- size_t buckets_to_move, buckets_unused = 0;
- struct bucket_heap_entry e;
- unsigned sectors_used, i;
- int reserve_sectors;
-
- if (!have_copygc_reserve(ca)) {
- struct closure cl;
-
- closure_init_stack(&cl);
- while (1) {
- closure_wait(&c->freelist_wait, &cl);
- if (have_copygc_reserve(ca))
- break;
- closure_sync(&cl);
- }
- closure_wake_up(&c->freelist_wait);
- }
-
- reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
-
- trace_bcache_moving_gc_start(ca);
-
- /*
- * Find buckets with lowest sector counts, skipping completely
- * empty buckets, by building a maxheap sorted by sector count,
- * and repeatedly replacing the maximum element until all
- * buckets have been visited.
- */
-
- /*
- * We need bucket marks to be up to date, so gc can't be recalculating
- * them, and we don't want the allocator invalidating a bucket after
- * we've decided to evacuate it but before we set copygc:
- */
- down_read(&c->gc_lock);
- mutex_lock(&ca->heap_lock);
- mutex_lock(&ca->fs->bucket_lock);
-
- ca->heap.used = 0;
- for_each_bucket(g, ca) {
- bucket_cmpxchg(g, new, new.copygc = 0);
-
- if (bucket_unused(g)) {
- buckets_unused++;
- continue;
- }
-
- if (g->mark.owned_by_allocator ||
- g->mark.data_type != BUCKET_DATA)
- continue;
-
- sectors_used = bucket_sectors_used(g);
-
- if (sectors_used >= ca->mi.bucket_size)
- continue;
-
- bucket_heap_push(ca, g, sectors_used);
- }
-
- sectors_to_move = 0;
- for (i = 0; i < ca->heap.used; i++)
- sectors_to_move += ca->heap.data[i].val;
-
- while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
- BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
- sectors_to_move -= e.val;
- }
-
- for (i = 0; i < ca->heap.used; i++)
- bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1);
-
- buckets_to_move = ca->heap.used;
-
- mutex_unlock(&ca->fs->bucket_lock);
- mutex_unlock(&ca->heap_lock);
- up_read(&c->gc_lock);
-
- read_moving(ca, buckets_to_move, sectors_to_move);
-}
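
A minimal userspace sketch of the selection policy bch_moving_gc() implements above, with made-up numbers: keep the emptiest candidate buckets and drop the fullest ones until the remaining work fits under the per-iteration sector budget (a plain sort stands in for the bucket heap used here):

	#include <stdio.h>
	#include <stdlib.h>

	struct candidate { unsigned bucket; unsigned sectors_used; };

	static int cmp_fullest_first(const void *a, const void *b)
	{
		const struct candidate *x = a, *y = b;

		return (x->sectors_used < y->sectors_used) -
		       (x->sectors_used > y->sectors_used);
	}

	int main(void)
	{
		struct candidate c[] = {
			{ 0, 100 }, { 1, 900 }, { 2, 300 }, { 3, 50 }, { 4, 700 },
		};
		unsigned nr = sizeof(c) / sizeof(c[0]), i = 0;
		unsigned budget = 1000, total = 0;

		for (unsigned j = 0; j < nr; j++)
			total += c[j].sectors_used;

		/* drop the fullest candidates until the rest fits the budget */
		qsort(c, nr, sizeof(c[0]), cmp_fullest_first);
		while (total > budget && i < nr)
			total -= c[i++].sectors_used;

		printf("would move %u buckets, %u sectors\n", nr - i, total);
		return 0;
	}
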
-
-static int bch_moving_gc_thread(void *arg)
-{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last;
- u64 available, want, next;
-
- set_freezable();
-
- while (!kthread_should_stop()) {
- if (kthread_wait_freezable(c->copy_gc_enabled))
- break;
-
- last = atomic_long_read(&clock->now);
- /*
- * don't start copygc until less than half the gc reserve is
- * available:
- */
- available = dev_buckets_available(ca);
- want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
- c->opts.gc_reserve_percent, 200);
- if (available > want) {
- next = last + (available - want) *
- ca->mi.bucket_size;
- bch_kthread_io_clock_wait(clock, next);
- continue;
- }
-
- bch_moving_gc(ca);
- }
-
- return 0;
-}
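
A worked version of the throttle in bch_moving_gc_thread() above, with illustrative numbers rather than values from any real device: copygc only runs once fewer than half of the reserved buckets remain available, and otherwise the thread sleeps on the write I/O clock for however many sectors may still be written before that point:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long nbuckets = 100000, first_bucket = 16;
		unsigned long long bucket_size = 1024;		/* sectors */
		unsigned long long gc_reserve_percent = 10;
		unsigned long long available = 8000;		/* free buckets */

		/* "half the gc reserve": percent / 2, hence the divide by 200 */
		unsigned long long want = (nbuckets - first_bucket) *
			gc_reserve_percent / 200;

		if (available > want)
			printf("sleep for %llu sectors of writes\n",
			       (available - want) * bucket_size);
		else
			printf("run copygc now (%llu <= %llu)\n", available, want);
		return 0;
	}
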
-
-void bch_moving_gc_stop(struct bch_dev *ca)
-{
- ca->moving_gc_pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
- if (ca->moving_gc_read)
- kthread_stop(ca->moving_gc_read);
- ca->moving_gc_read = NULL;
-}
-
-int bch_moving_gc_start(struct bch_dev *ca)
-{
- struct task_struct *t;
-
- BUG_ON(ca->moving_gc_read);
-
- if (ca->fs->opts.nochanges)
- return 0;
-
- if (bch_fs_init_fault("moving_gc_start"))
- return -ENOMEM;
-
- t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read");
- if (IS_ERR(t))
- return PTR_ERR(t);
-
- ca->moving_gc_read = t;
- wake_up_process(ca->moving_gc_read);
-
- return 0;
-}
-
-void bch_dev_moving_gc_init(struct bch_dev *ca)
-{
- bch_pd_controller_init(&ca->moving_gc_pd);
- ca->moving_gc_pd.d_term = 0;
-}
diff --git a/libbcache/movinggc.h b/libbcache/movinggc.h
deleted file mode 100644
index 5afbf34f..00000000
--- a/libbcache/movinggc.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _BCACHE_MOVINGGC_H
-#define _BCACHE_MOVINGGC_H
-
-/*
- * We can't use the entire copygc reserve in one iteration of copygc: we may
- * need the buckets we're freeing up to go back into the copygc reserve to make
- * forward progress, but if the copygc reserve is full they'll be available for
- * any allocation - and it's possible that in a given iteration, we free up most
- * of the buckets we're going to free before we allocate most of the buckets
- * we're going to allocate.
- *
- * If we only use half of the reserve per iteration, then in steady state we'll
- * always have room in the reserve for the buckets we're going to need in the
- * next iteration:
- */
-#define COPYGC_BUCKETS_PER_ITER(ca) \
- ((ca)->free[RESERVE_MOVINGGC].size / 2)
-
-/*
- * Max sectors to move per iteration: Have to take into account internal
- * fragmentation from the multiple write points for each generation:
- */
-#define COPYGC_SECTORS_PER_ITER(ca) \
- ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
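
For concreteness, with made-up numbers (a 128-bucket movinggc reserve and 1024-sector buckets) the two limits above come out to 64 buckets and 65536 sectors per pass, leaving the other half of the reserve free for the buckets the next pass will need:

	#include <stdio.h>

	int main(void)
	{
		unsigned reserve_size = 128;	/* free[RESERVE_MOVINGGC].size */
		unsigned bucket_size = 1024;	/* sectors */
		unsigned buckets_per_iter = reserve_size / 2;

		printf("%u buckets, %u sectors per iteration\n",
		       buckets_per_iter, bucket_size * buckets_per_iter);
		return 0;
	}
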
-
-void bch_moving_gc_stop(struct bch_dev *);
-int bch_moving_gc_start(struct bch_dev *);
-void bch_dev_moving_gc_init(struct bch_dev *);
-
-#endif
diff --git a/libbcache/notify.c b/libbcache/notify.c
deleted file mode 100644
index b06a8749..00000000
--- a/libbcache/notify.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Code for sending uevent notifications to user-space.
- *
- * Copyright 2015 Datera, Inc.
- */
-
-#include "bcache.h"
-#include "notify.h"
-
-#include <linux/kobject.h>
-
-#define notify_var(c, format, ...) \
-({ \
- int ret; \
- lockdep_assert_held(&(c)->uevent_lock); \
- ret = add_uevent_var(&(c)->uevent_env, format, ##__VA_ARGS__); \
- WARN_ON_ONCE(ret); \
-})
-
-static void notify_get(struct bch_fs *c)
-{
- struct kobj_uevent_env *env = &c->uevent_env;
-
- mutex_lock(&c->uevent_lock);
- env->envp_idx = 0;
- env->buflen = 0;
-
- notify_var(c, "SET_UUID=%pU", c->sb.user_uuid.b);
-}
-
-static void notify_get_cache(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- notify_get(c);
- notify_var(c, "UUID=%pU", ca->uuid.b);
- notify_var(c, "BLOCKDEV=%s", ca->name);
-}
-
-static void notify_put(struct bch_fs *c)
-{
- struct kobj_uevent_env *env = &c->uevent_env;
-
- env->envp[env->envp_idx] = NULL;
- kobject_uevent_env(&c->kobj, KOBJ_CHANGE, env->envp);
- mutex_unlock(&c->uevent_lock);
-}
-
-void bch_notify_fs_read_write(struct bch_fs *c)
-{
- notify_get(c);
- notify_var(c, "STATE=active");
- notify_put(c);
-}
-
-void bch_notify_fs_read_only(struct bch_fs *c)
-{
- notify_get(c);
- notify_var(c, "STATE=readonly");
- notify_put(c);
-}
-
-void bch_notify_fs_stopped(struct bch_fs *c)
-{
- notify_get(c);
- notify_var(c, "STATE=stopped");
- notify_put(c);
-}
-
-void bch_notify_dev_read_write(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- notify_get_cache(ca);
- notify_var(c, "STATE=active");
- notify_put(c);
-}
-
-void bch_notify_dev_read_only(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- notify_get_cache(ca);
- notify_var(c, "STATE=readonly");
- notify_put(c);
-}
-
-void bch_notify_dev_added(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- notify_get_cache(ca);
-	notify_var(c, "STATE=added");
- notify_put(c);
-}
-
-void bch_notify_dev_error(struct bch_dev *ca, bool fatal)
-{
- struct bch_fs *c = ca->fs;
-
- notify_get_cache(ca);
- notify_var(c, "STATE=error");
- notify_var(c, "FATAL=%d", fatal);
- notify_put(c);
-}
diff --git a/libbcache/notify.h b/libbcache/notify.h
deleted file mode 100644
index 2c1e3679..00000000
--- a/libbcache/notify.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Code for sending uevent notifications to user-space.
- *
- * Copyright 2015 Datera, Inc.
- */
-
-#ifndef _NOTIFY_H
-#define _NOTIFY_H
-
-#ifndef NO_BCACHE_NOTIFY
-
-void bch_notify_fs_read_write(struct bch_fs *);
-void bch_notify_fs_read_only(struct bch_fs *);
-void bch_notify_fs_stopped(struct bch_fs *);
-
-void bch_notify_dev_read_write(struct bch_dev *);
-void bch_notify_dev_read_only(struct bch_dev *);
-void bch_notify_dev_added(struct bch_dev *);
-void bch_notify_dev_error(struct bch_dev *, bool);
-
-#else
-
-static inline void bch_notify_fs_read_write(struct bch_fs *c) {}
-static inline void bch_notify_fs_read_only(struct bch_fs *c) {}
-static inline void bch_notify_fs_stopped(struct bch_fs *c) {}
-
-static inline void bch_notify_dev_read_write(struct bch_dev *ca) {}
-static inline void bch_notify_dev_read_only(struct bch_dev *ca) {}
-static inline void bch_notify_dev_added(struct bch_dev *ca) {}
-static inline void bch_notify_dev_error(struct bch_dev *ca, bool b) {}
-
-#endif
-
-#endif /* _NOTIFY_H */
diff --git a/libbcache/opts.c b/libbcache/opts.c
deleted file mode 100644
index 41780d59..00000000
--- a/libbcache/opts.c
+++ /dev/null
@@ -1,241 +0,0 @@
-
-#include <linux/kernel.h>
-
-#include "opts.h"
-#include "util.h"
-
-const char * const bch_error_actions[] = {
- "continue",
- "remount-ro",
- "panic",
- NULL
-};
-
-const char * const bch_csum_types[] = {
- "none",
- "crc32c",
- "crc64",
- NULL
-};
-
-const char * const bch_compression_types[] = {
- "none",
- "lz4",
- "gzip",
- NULL
-};
-
-const char * const bch_str_hash_types[] = {
- "crc32c",
- "crc64",
- "siphash",
- NULL
-};
-
-const char * const bch_cache_replacement_policies[] = {
- "lru",
- "fifo",
- "random",
- NULL
-};
-
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch_cache_modes[] = {
- "default",
- "writethrough",
- "writeback",
- "writearound",
- "none",
- NULL
-};
-
-const char * const bch_dev_state[] = {
- "readwrite",
- "readonly",
- "failed",
- "spare",
- NULL
-};
-
-const struct bch_option bch_opt_table[] = {
-#define OPT_BOOL() .type = BCH_OPT_BOOL
-#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max
-#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices
-
-#define BCH_OPT(_name, _mode, _sb_opt, _bits, _type) \
- [Opt_##_name] = { \
- .name = #_name, \
- .set_sb = SET_##_sb_opt, \
- _type \
- },
- BCH_VISIBLE_OPTS()
-#undef BCH_OPT
-};
-
-static enum bch_opt_id bch_opt_lookup(const char *name)
-{
- const struct bch_option *i;
-
- for (i = bch_opt_table;
- i < bch_opt_table + ARRAY_SIZE(bch_opt_table);
- i++)
- if (!strcmp(name, i->name))
- return i - bch_opt_table;
-
- return -1;
-}
-
-static u64 bch_opt_get(struct bch_opts *opts, enum bch_opt_id id)
-{
- switch (id) {
-#define BCH_OPT(_name, ...) \
- case Opt_##_name: \
- return opts->_name; \
-
- BCH_VISIBLE_OPTS()
-#undef BCH_OPT
-
- default:
- BUG();
- }
-}
-
-void bch_opt_set(struct bch_opts *opts, enum bch_opt_id id, u64 v)
-{
- switch (id) {
-#define BCH_OPT(_name, ...) \
- case Opt_##_name: \
- opts->_name = v; \
- break;
-
- BCH_VISIBLE_OPTS()
-#undef BCH_OPT
-
- default:
- BUG();
- }
-}
-
-/*
- * Initial options from the superblock - here we don't want any options left
- * undefined; any options the superblock doesn't specify are set to 0:
- */
-struct bch_opts bch_sb_opts(struct bch_sb *sb)
-{
- struct bch_opts opts = bch_opts_empty();
-
-#define BCH_OPT(_name, _mode, _sb_opt, ...) \
- if (_sb_opt != NO_SB_OPT) \
- opts._name = _sb_opt(sb);
-
- BCH_OPTS()
-#undef BCH_OPT
-
- return opts;
-}
-
-int parse_one_opt(enum bch_opt_id id, const char *val, u64 *res)
-{
- const struct bch_option *opt = &bch_opt_table[id];
- ssize_t ret;
-
- switch (opt->type) {
- case BCH_OPT_BOOL:
- ret = kstrtou64(val, 10, res);
- if (ret < 0)
- return ret;
-
- if (*res > 1)
- return -ERANGE;
- break;
- case BCH_OPT_UINT:
- ret = kstrtou64(val, 10, res);
- if (ret < 0)
- return ret;
-
- if (*res < opt->min || *res >= opt->max)
- return -ERANGE;
- break;
- case BCH_OPT_STR:
- ret = bch_read_string_list(val, opt->choices);
- if (ret < 0)
- return ret;
-
- *res = ret;
- break;
- }
-
- return 0;
-}
-
-int bch_parse_mount_opts(struct bch_opts *opts, char *options)
-{
- char *opt, *name, *val;
- int ret, id;
- u64 v;
-
- while ((opt = strsep(&options, ",")) != NULL) {
- name = strsep(&opt, "=");
- val = opt;
-
- if (val) {
- id = bch_opt_lookup(name);
- if (id < 0)
- return -EINVAL;
-
- ret = parse_one_opt(id, val, &v);
- if (ret < 0)
- return ret;
- } else {
- id = bch_opt_lookup(name);
- v = 1;
-
- if (id < 0 &&
- !strncmp("no", name, 2)) {
- id = bch_opt_lookup(name + 2);
- v = 0;
- }
-
-			if (id < 0 ||
-			    bch_opt_table[id].type != BCH_OPT_BOOL)
- return -EINVAL;
- }
-
- bch_opt_set(opts, id, v);
- }
-
- return 0;
-}
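
A hedged usage sketch of how a mount string flows through bch_parse_mount_opts() above (kernel context and opts.h assumed; the option string itself is just an example, and error handling is trimmed):

	/* illustrative fragment, not standalone */
	static int example_parse(struct bch_opts *opts)
	{
		char buf[] = "errors=remount-ro,nochanges,noposix_acl";

		/*
		 * "errors=remount-ro" is looked up and parsed per its type
		 * (OPT_STR against bch_error_actions), "nochanges" is a bare
		 * boolean set to 1, and "noposix_acl" has its "no" prefix
		 * stripped so posix_acl is set to 0.
		 */
		return bch_parse_mount_opts(opts, buf);
	}
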
-
-enum bch_opt_id bch_parse_sysfs_opt(const char *name, const char *val,
- u64 *res)
-{
- enum bch_opt_id id = bch_opt_lookup(name);
- int ret;
-
- if (id < 0)
- return -EINVAL;
-
- ret = parse_one_opt(id, val, res);
- if (ret < 0)
- return ret;
-
- return id;
-}
-
-ssize_t bch_opt_show(struct bch_opts *opts, const char *name,
- char *buf, size_t size)
-{
- enum bch_opt_id id = bch_opt_lookup(name);
- const struct bch_option *opt;
- u64 v;
-
- if (id < 0)
- return -EINVAL;
-
- v = bch_opt_get(opts, id);
- opt = &bch_opt_table[id];
-
- return opt->type == BCH_OPT_STR
- ? bch_snprint_string_list(buf, size, opt->choices, v)
- : snprintf(buf, size, "%lli\n", v);
-}
diff --git a/libbcache/opts.h b/libbcache/opts.h
deleted file mode 100644
index 253b7399..00000000
--- a/libbcache/opts.h
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef _BCACHE_OPTS_H
-#define _BCACHE_OPTS_H
-
-#include <linux/bcache.h>
-#include <linux/bug.h>
-#include <linux/log2.h>
-#include <linux/string.h>
-
-extern const char * const bch_error_actions[];
-extern const char * const bch_csum_types[];
-extern const char * const bch_compression_types[];
-extern const char * const bch_str_hash_types[];
-extern const char * const bch_cache_replacement_policies[];
-extern const char * const bch_cache_modes[];
-extern const char * const bch_dev_state[];
-
-/*
- * Mount options; we also store defaults in the superblock.
- *
- * Also exposed via sysfs: if an option is writeable, and it's also stored in
- * the superblock, changing it via sysfs (currently? might change this) also
- * updates the superblock.
- *
- * We store options as signed integers, where -1 means undefined. This means we
- * can pass the mount options to bch_fs_alloc() as a whole struct, and then only
- * apply the options from that struct that are defined.
- */
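
A compressed userspace sketch of the scheme this comment describes, using a hypothetical two-option table instead of the real BCH_OPTS() list: every option is a signed integer, memset()ing the struct to 255 makes every field -1 (undefined), and applying one options struct onto another copies only the fields that were actually set:

	#include <stdio.h>
	#include <string.h>

	typedef signed char s8;

	#define EX_OPTS()		\
		EX_OPT(posix_acl, s8)	\
		EX_OPT(verbose, s8)

	struct ex_opts {
	#define EX_OPT(_name, _bits) _bits _name;
		EX_OPTS()
	#undef EX_OPT
	};

	static struct ex_opts ex_opts_empty(void)
	{
		struct ex_opts ret;

		memset(&ret, 255, sizeof(ret));	/* every field becomes -1 */
		return ret;
	}

	static void ex_opts_apply(struct ex_opts *dst, struct ex_opts src)
	{
	#define EX_OPT(_name, _bits) if (src._name >= 0) dst->_name = src._name;
		EX_OPTS()
	#undef EX_OPT
	}

	int main(void)
	{
		struct ex_opts defaults = { .posix_acl = 1, .verbose = 0 };
		struct ex_opts cmdline = ex_opts_empty();

		cmdline.verbose = 1;		/* only this option was given */
		ex_opts_apply(&defaults, cmdline);

		printf("posix_acl=%d verbose=%d\n",
		       defaults.posix_acl, defaults.verbose);
		return 0;
	}
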
-
-/* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
-
-/**
- * BCH_OPT(name, mode, sb_opt, bits, type)
- *
- * @name - name of mount option, sysfs attribute, and struct bch_opts
- *	    member
- *
- * @mode - sysfs attr permissions
- *
- * @sb_opt - name of corresponding superblock option, or NO_SB_OPT if the
- *	    option isn't stored in the superblock
- *
- * @bits - type of the struct bch_opts member (e.g. s8)
- *
- * @type - one of OPT_BOOL(), OPT_UINT(), OPT_STR()
- */
-
-enum opt_type {
- BCH_OPT_BOOL,
- BCH_OPT_UINT,
- BCH_OPT_STR,
-};
-
-#define BCH_VISIBLE_OPTS() \
- BCH_OPT(errors, 0644, BCH_SB_ERROR_ACTION, \
- s8, OPT_STR(bch_error_actions)) \
- BCH_OPT(metadata_replicas, 0444, BCH_SB_META_REPLICAS_WANT,\
- s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
- BCH_OPT(data_replicas, 0444, BCH_SB_DATA_REPLICAS_WANT,\
- s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
- BCH_OPT(metadata_replicas_required, 0444, BCH_SB_META_REPLICAS_REQ,\
- s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
- BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\
- s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \
- BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE, \
- s8, OPT_STR(bch_csum_types)) \
- BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE, \
- s8, OPT_STR(bch_csum_types)) \
- BCH_OPT(compression, 0644, BCH_SB_COMPRESSION_TYPE,\
- s8, OPT_STR(bch_compression_types)) \
- BCH_OPT(str_hash, 0644, BCH_SB_STR_HASH_TYPE, \
- s8, OPT_STR(bch_str_hash_types)) \
- BCH_OPT(inodes_32bit, 0644, BCH_SB_INODE_32BIT, \
- s8, OPT_BOOL()) \
- BCH_OPT(gc_reserve_percent, 0444, BCH_SB_GC_RESERVE, \
- s8, OPT_UINT(5, 21)) \
- BCH_OPT(root_reserve_percent, 0444, BCH_SB_ROOT_RESERVE, \
- s8, OPT_UINT(0, 100)) \
- BCH_OPT(wide_macs, 0644, BCH_SB_128_BIT_MACS, \
- s8, OPT_BOOL()) \
- BCH_OPT(verbose_recovery, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(posix_acl, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(journal_flush_disabled, 0644, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(nofsck, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(fix_errors, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(nochanges, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(noreplay, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(norecovery, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(noexcl, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(sb, 0444, NO_SB_OPT, \
- s64, OPT_UINT(0, S64_MAX)) \
-
-#define BCH_OPTS() \
- BCH_OPT(read_only, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_OPT(nostart, 0444, NO_SB_OPT, \
- s8, OPT_BOOL()) \
- BCH_VISIBLE_OPTS()
-
-struct bch_opts {
-#define BCH_OPT(_name, _mode, _sb_opt, _bits, ...) \
- _bits _name;
-
- BCH_OPTS()
-#undef BCH_OPT
-};
-
-enum bch_opt_id {
-#define BCH_OPT(_name, ...) \
- Opt_##_name,
-
- BCH_VISIBLE_OPTS()
-#undef BCH_OPT
-};
-
-struct bch_option {
- const char *name;
- void (*set_sb)(struct bch_sb *, u64);
- enum opt_type type;
-
- union {
- struct {
- u64 min, max;
- };
- struct {
- const char * const *choices;
- };
- };
-
-};
-
-extern const struct bch_option bch_opt_table[];
-
-static inline struct bch_opts bch_opts_empty(void)
-{
- struct bch_opts ret;
-
- memset(&ret, 255, sizeof(ret));
- return ret;
-}
-
-static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src)
-{
-#define BCH_OPT(_name, ...) \
- if (src._name >= 0) \
- dst->_name = src._name;
-
- BCH_OPTS()
-#undef BCH_OPT
-}
-
-#define opt_defined(_opt) ((_opt) >= 0)
-
-void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64);
-struct bch_opts bch_sb_opts(struct bch_sb *);
-
-int bch_parse_mount_opts(struct bch_opts *, char *);
-enum bch_opt_id bch_parse_sysfs_opt(const char *, const char *, u64 *);
-
-ssize_t bch_opt_show(struct bch_opts *, const char *, char *, size_t);
-
-#endif /* _BCACHE_OPTS_H */
diff --git a/libbcache/request.c b/libbcache/request.c
deleted file mode 100644
index b24770bc..00000000
--- a/libbcache/request.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/*
- * Handle a read or a write request and decide what to do with it.
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- *
- * Main pieces here:
- *
- * 1) Data insert path, via bch_data_insert() -- writes data to cache and
- * updates extents btree
- * 2) Read path, via bch_read() -- for now only used by bcachefs and ioctl
- * interface
- * 3) Read path, via cache_lookup() and struct search -- used by block device
- * make_request functions
- * 4) Cache promotion -- used by bch_read() and cache_lookup() to copy data to
- * the cache, either from a backing device or a cache device in a higher tier
- *
- * One tricky thing that comes up is a race condition where a bucket may be
- * re-used while reads from it are still in flight. To guard against this, we
- * save the ptr that is being read and check if it is stale once the read
- * completes. If the ptr is stale, the read is retried.
- *
- * #2 and #3 will be unified further in the future.
- */
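
A generic sketch of the stale-pointer guard described above; the structures and field names are illustrative stand-ins, not bcache's actual ones. Each pointer remembers the generation of the bucket it points into, the allocator bumps a bucket's generation when it reuses it, and a read whose pointer generation no longer matches is thrown away and retried:

	/* illustrative only, not the real bcache structures */
	struct ex_bucket { unsigned char gen; };
	struct ex_ptr	 { unsigned long bucket; unsigned char gen; };

	static int ex_ptr_stale(const struct ex_bucket *buckets,
				const struct ex_ptr *ptr)
	{
		/* bucket was reused (gen bumped) while the read was in flight */
		return buckets[ptr->bucket].gen != ptr->gen;
	}
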
-
-#include "bcache.h"
-#include "blockdev.h"
-#include "btree_update.h"
-#include "btree_iter.h"
-#include "clock.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "journal.h"
-#include "keybuf.h"
-#include "request.h"
-#include "writeback.h"
-#include "stats.h"
-
-#include <linux/module.h>
-#include <linux/hash.h>
-#include <linux/random.h>
-#include <linux/backing-dev.h>
-
-#include <trace/events/bcache.h>
-
-#define CUTOFF_CACHE_ADD 10
-#define CUTOFF_CACHE_READA 15
-
-/* Congested? */
-
-unsigned bch_get_congested(struct bch_fs *c)
-{
- int i;
- long rand;
-
- if (!c->congested_read_threshold_us &&
- !c->congested_write_threshold_us)
- return 0;
-
- i = (local_clock_us() - c->congested_last_us) / 1024;
- if (i < 0)
- return 0;
-
- i += atomic_read(&c->congested);
- if (i >= 0)
- return 0;
-
- i += CONGESTED_MAX;
-
- if (i > 0)
- i = fract_exp_two(i, 6);
-
- rand = get_random_int();
- i -= bitmap_weight(&rand, BITS_PER_LONG);
-
- return i > 0 ? i : 1;
-}
-
-static void add_sequential(struct task_struct *t)
-{
- t->sequential_io_avg = ewma_add(t->sequential_io_avg,
- t->sequential_io, 3);
- t->sequential_io = 0;
-}
-
-static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
-{
- return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
-}
-
-static bool check_should_bypass(struct cached_dev *dc, struct bio *bio, int rw)
-{
- struct bch_fs *c = dc->disk.c;
- unsigned mode = BDEV_CACHE_MODE(dc->disk_sb.sb);
- unsigned sectors, congested = bch_get_congested(c);
- struct task_struct *task = current;
- struct io *i;
-
- if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- sectors_available(c) * 100 < c->capacity * CUTOFF_CACHE_ADD ||
- (bio_op(bio) == REQ_OP_DISCARD))
- goto skip;
-
- if (mode == CACHE_MODE_NONE ||
- (mode == CACHE_MODE_WRITEAROUND &&
- op_is_write(bio_op(bio))))
- goto skip;
-
- if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
- bio_sectors(bio) & (c->sb.block_size - 1)) {
- pr_debug("skipping unaligned io");
- goto skip;
- }
-
- if (bypass_torture_test(dc)) {
- if ((get_random_int() & 3) == 3)
- goto skip;
- else
- goto rescale;
- }
-
- if (!congested && !dc->sequential_cutoff)
- goto rescale;
-
- if (!congested &&
- mode == CACHE_MODE_WRITEBACK &&
- op_is_write(bio_op(bio)) &&
- (bio->bi_opf & REQ_SYNC))
- goto rescale;
-
- spin_lock(&dc->io_lock);
-
- hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
- if (i->last == bio->bi_iter.bi_sector &&
- time_before(jiffies, i->last_io))
- goto found;
-
- i = list_first_entry(&dc->io_lru, struct io, lru);
-
- add_sequential(task);
- i->sequential = 0;
-found:
- if (i->sequential + bio->bi_iter.bi_size > i->sequential)
- i->sequential += bio->bi_iter.bi_size;
-
- i->last = bio_end_sector(bio);
- i->last_io = jiffies + msecs_to_jiffies(5000);
- task->sequential_io = i->sequential;
-
- hlist_del(&i->hash);
- hlist_add_head(&i->hash, iohash(dc, i->last));
- list_move_tail(&i->lru, &dc->io_lru);
-
- spin_unlock(&dc->io_lock);
-
- sectors = max(task->sequential_io,
- task->sequential_io_avg) >> 9;
-
- if (dc->sequential_cutoff &&
- sectors >= dc->sequential_cutoff >> 9) {
- trace_bcache_bypass_sequential(bio);
- goto skip;
- }
-
- if (congested && sectors >= congested) {
- trace_bcache_bypass_congested(bio);
- goto skip;
- }
-
-rescale:
- return false;
-skip:
- bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
- return true;
-}
-
-/* Common code for the make_request functions */
-
-/**
- * request_endio - endio function for backing device bios
- */
-static void request_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
-
- if (bio->bi_error) {
- struct search *s = container_of(cl, struct search, cl);
- s->iop.error = bio->bi_error;
- /* Only cache read errors are recoverable */
- s->recoverable = false;
- }
-
- bio_put(bio);
- closure_put(cl);
-}
-
-static void bio_complete(struct search *s)
-{
- if (s->orig_bio) {
- generic_end_io_acct(bio_data_dir(s->orig_bio),
- &s->d->disk->part0, s->start_time);
-
- trace_bcache_request_end(s->d, s->orig_bio);
- s->orig_bio->bi_error = s->iop.error;
- bio_endio(s->orig_bio);
- s->orig_bio = NULL;
- }
-}
-
-static void do_bio_hook(struct search *s, struct bio *orig_bio)
-{
- int rw = bio_data_dir(orig_bio);
- struct bio *bio = rw ? &s->wbio.bio : &s->rbio.bio;
-
- bio_init(bio);
- __bio_clone_fast(bio, orig_bio);
- bio->bi_end_io = request_endio;
- bio->bi_private = &s->cl;
-
- bio_cnt_set(bio, 3);
-}
-
-static void search_free(struct closure *cl)
-{
- struct search *s = container_of(cl, struct search, cl);
-
- bio_complete(s);
-
- if (s->iop.bio)
- bio_put(&s->iop.bio->bio);
-
- closure_debug_destroy(cl);
- mempool_free(s, &s->d->c->search);
-}
-
-static inline struct search *search_alloc(struct bio *bio,
- struct bcache_device *d)
-{
- struct search *s;
-
- s = mempool_alloc(&d->c->search, GFP_NOIO);
-
- closure_init(&s->cl, NULL);
- do_bio_hook(s, bio);
-
- s->orig_bio = bio;
- s->d = d;
- s->recoverable = 1;
- s->bypass = 0;
- s->write = op_is_write(bio_op(bio));
- s->read_dirty_data = 0;
- s->cache_miss = 0;
- s->start_time = jiffies;
- s->inode = bcache_dev_inum(d);
-
- s->iop.c = d->c;
- s->iop.bio = NULL;
- s->iop.error = 0;
-
- return s;
-}
-
-/* Cached devices */
-
-static void cached_dev_bio_complete(struct closure *cl)
-{
- struct search *s = container_of(cl, struct search, cl);
- struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-
- search_free(cl);
- cached_dev_put(dc);
-}
-
-/* Process reads */
-
-static void cached_dev_read_error(struct closure *cl)
-{
- struct search *s = container_of(cl, struct search, cl);
- struct bio *bio = &s->rbio.bio;
-
- if (s->recoverable) {
- /* Read bucket invalidate races are handled here, also plain
- * old IO errors from the cache that can be retried from the
- * backing device (reads of clean data) */
- trace_bcache_read_retry(s->orig_bio);
-
- s->iop.error = 0;
- do_bio_hook(s, s->orig_bio);
-
- /* XXX: invalidate cache, don't count twice */
-
- closure_bio_submit(bio, cl);
- }
-
- continue_at(cl, cached_dev_bio_complete, NULL);
-}
-
-static void cached_dev_read_done(struct closure *cl)
-{
- struct search *s = container_of(cl, struct search, cl);
- struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-
- if (dc->verify && s->recoverable && !s->read_dirty_data)
- bch_data_verify(dc, s->orig_bio);
-
- continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
-}
-
-static void cached_dev_read_done_bh(struct closure *cl)
-{
- struct search *s = container_of(cl, struct search, cl);
- struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-
- bch_mark_cache_accounting(s->iop.c, dc, !s->cache_miss, s->bypass);
- trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass);
-
- if (s->iop.error)
- continue_at_nobarrier(cl, cached_dev_read_error, s->iop.c->wq);
- else if (dc->verify)
- continue_at_nobarrier(cl, cached_dev_read_done, s->iop.c->wq);
- else
- continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
-}
-
-/**
- * __cache_promote -- insert result of read bio into cache
- *
- * Used for backing devices and flash-only volumes.
- *
- * @orig_bio must actually be a bbio with a valid key.
- */
-void __cache_promote(struct bch_fs *c, struct bch_read_bio *orig_bio,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned write_flags)
-{
-#if 0
- struct cache_promote_op *op;
- struct bio *bio;
- unsigned pages = DIV_ROUND_UP(orig_bio->bio.bi_iter.bi_size, PAGE_SIZE);
-
- /* XXX: readahead? */
-
- op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
- if (!op)
- goto out_submit;
-
- /* clone the bbio */
- memcpy(&op->bio, orig_bio, offsetof(struct bbio, bio));
-
- bio = &op->bio.bio.bio;
- bio_init(bio);
- bio_get(bio);
- bio->bi_bdev = orig_bio->bio.bi_bdev;
- bio->bi_iter.bi_sector = orig_bio->bio.bi_iter.bi_sector;
- bio->bi_iter.bi_size = orig_bio->bio.bi_iter.bi_size;
- bio->bi_end_io = cache_promote_endio;
- bio->bi_private = &op->cl;
- bio->bi_io_vec = bio->bi_inline_vecs;
- bch_bio_map(bio, NULL);
-
- if (bio_alloc_pages(bio, __GFP_NOWARN|GFP_NOIO))
- goto out_free;
-
- orig_bio->ca = NULL;
-
- closure_init(&op->cl, &c->cl);
- op->orig_bio = &orig_bio->bio;
- op->stale = 0;
-
- bch_write_op_init(&op->iop, c, &op->bio, &c->promote_write_point,
- new, old,
- BCH_WRITE_ALLOC_NOWAIT|write_flags);
- op->iop.nr_replicas = 1;
-
- //bch_cut_front(bkey_start_pos(&orig_bio->key.k), &op->iop.insert_key);
- //bch_cut_back(orig_bio->key.k.p, &op->iop.insert_key.k);
-
- trace_bcache_promote(&orig_bio->bio);
-
- op->bio.bio.submit_time_us = local_clock_us();
- closure_bio_submit(bio, &op->cl);
-
- continue_at(&op->cl, cache_promote_write, c->wq);
-out_free:
- kfree(op);
-out_submit:
- generic_make_request(&orig_bio->bio);
-#endif
-}
-
-/**
- * cached_dev_cache_miss - populate cache with data from backing device
- *
- * We don't write to the cache if s->bypass is set.
- */
-static int cached_dev_cache_miss(struct btree_iter *iter, struct search *s,
- struct bio *bio, unsigned sectors)
-{
- int ret;
- unsigned reada = 0;
- struct bio *miss;
- BKEY_PADDED(key) replace;
-
- s->cache_miss = 1;
-
- if (s->bypass)
- goto nopromote;
-#if 0
- struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-
- /* XXX: broken */
- if (!(bio->bi_opf & REQ_RAHEAD) &&
- !(bio->bi_opf & REQ_META) &&
- ((u64) sectors_available(dc->disk.c) * 100 <
- (u64) iter->c->capacity * CUTOFF_CACHE_READA))
- reada = min_t(sector_t, dc->readahead >> 9,
- bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
-#endif
- sectors = min(sectors, bio_sectors(bio) + reada);
-
- replace.key.k = KEY(s->inode,
- bio->bi_iter.bi_sector + sectors,
- sectors);
-
- ret = bch_btree_insert_check_key(iter, &replace.key);
- if (ret == -EINTR)
- return ret;
-
- miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
-
- miss->bi_end_io = request_endio;
- miss->bi_private = &s->cl;
-
- //to_bbio(miss)->key.k = KEY(s->inode,
- // bio_end_sector(miss),
- // bio_sectors(miss));
- to_rbio(miss)->ca = NULL;
-
- closure_get(&s->cl);
- __cache_promote(s->iop.c, to_rbio(miss),
- bkey_i_to_s_c(&replace.key),
- bkey_to_s_c(&KEY(replace.key.k.p.inode,
- replace.key.k.p.offset,
- replace.key.k.size)),
- BCH_WRITE_CACHED);
-
- return 0;
-nopromote:
- miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
-
- miss->bi_end_io = request_endio;
- miss->bi_private = &s->cl;
- closure_bio_submit(miss, &s->cl);
-
- return 0;
-}
-
-static void cached_dev_read(struct cached_dev *dc, struct search *s)
-{
- struct bch_fs *c = s->iop.c;
- struct closure *cl = &s->cl;
- struct bio *bio = &s->rbio.bio;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch_increment_clock(c, bio_sectors(bio), READ);
-
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
- POS(s->inode, bio->bi_iter.bi_sector), k) {
- BKEY_PADDED(k) tmp;
- struct extent_pick_ptr pick;
- unsigned sectors, bytes;
- bool is_last;
-retry:
- bkey_reassemble(&tmp.k, k);
- bch_btree_iter_unlock(&iter);
- k = bkey_i_to_s_c(&tmp.k);
-
- bch_extent_pick_ptr(c, k, &pick);
- if (IS_ERR(pick.ca)) {
- bcache_io_error(c, bio, "no device to read from");
- goto out;
- }
-
- sectors = min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
- bio->bi_iter.bi_sector;
- bytes = sectors << 9;
- is_last = bytes == bio->bi_iter.bi_size;
- swap(bio->bi_iter.bi_size, bytes);
-
- if (pick.ca) {
- PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
- c->prio_clock[READ].hand;
-
- if (!bkey_extent_is_cached(k.k))
- s->read_dirty_data = true;
-
- bch_read_extent(c, &s->rbio, k, &pick,
- BCH_READ_ACCOUNT_TIMES|
- BCH_READ_RETRY_IF_STALE|
- (!s->bypass ? BCH_READ_PROMOTE : 0)|
- (is_last ? BCH_READ_IS_LAST : 0));
- } else {
- /* not present (hole), or stale cached data */
- if (cached_dev_cache_miss(&iter, s, bio, sectors)) {
- k = bch_btree_iter_peek_with_holes(&iter);
- if (btree_iter_err(k))
- break;
- goto retry;
- }
- }
-
- swap(bio->bi_iter.bi_size, bytes);
- bio_advance(bio, bytes);
-
- if (is_last) {
- bch_btree_iter_unlock(&iter);
- goto out;
- }
- }
-
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- ret = bch_btree_iter_unlock(&iter);
- BUG_ON(!ret);
- bcache_io_error(c, bio, "btree IO error %i", ret);
-out:
- continue_at(cl, cached_dev_read_done_bh, NULL);
-}
-
-/* Process writes */
-
-static void cached_dev_write_complete(struct closure *cl)
-{
- struct search *s = container_of(cl, struct search, cl);
- struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
-
- up_read_non_owner(&dc->writeback_lock);
- cached_dev_bio_complete(cl);
-}
-
-static void cached_dev_write(struct cached_dev *dc, struct search *s)
-{
- struct closure *cl = &s->cl;
- struct bio *bio = &s->wbio.bio;
- bool writeback = false;
- bool bypass = s->bypass;
- struct bkey insert_key = KEY(s->inode,
- bio_end_sector(bio),
- bio_sectors(bio));
- unsigned flags = BCH_WRITE_DISCARD_ON_ERROR;
-
- down_read_non_owner(&dc->writeback_lock);
- if (bch_keybuf_check_overlapping(&dc->writeback_keys,
- bkey_start_pos(&insert_key),
- insert_key.p)) {
- /*
- * We overlap with some dirty data undergoing background
- * writeback, force this write to writeback
- */
- bypass = false;
- writeback = true;
- }
-
- /*
- * Discards aren't _required_ to do anything, so skipping if
- * check_overlapping returned true is ok
- *
- * But check_overlapping drops dirty keys for which io hasn't started,
- * so we still want to call it.
- */
- if (bio_op(bio) == REQ_OP_DISCARD)
- bypass = true;
-
- if (should_writeback(dc, bio, BDEV_CACHE_MODE(dc->disk_sb.sb),
- bypass)) {
- bypass = false;
- writeback = true;
- }
-
- if (bypass) {
- /*
- * If this is a bypass-write (as opposed to a discard), send
- * it down to the backing device. If this is a discard, only
- * send it to the backing device if the backing device
- * supports discards. Otherwise, we simply discard the key
- * range from the cache and don't touch the backing device.
- */
- if ((bio_op(bio) != REQ_OP_DISCARD) ||
- blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
- closure_bio_submit(s->orig_bio, cl);
- } else if (writeback) {
- bch_writeback_add(dc);
-
- if (bio->bi_opf & REQ_PREFLUSH) {
- /* Also need to send a flush to the backing device */
- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
- &dc->disk.bio_split);
-
- flush->bi_bdev = bio->bi_bdev;
- flush->bi_end_io = request_endio;
- flush->bi_private = cl;
- bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH);
-
- closure_bio_submit(flush, cl);
- }
- } else {
- struct bio *writethrough =
- bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
-
- closure_bio_submit(writethrough, cl);
-
- flags |= BCH_WRITE_CACHED;
- flags |= BCH_WRITE_ALLOC_NOWAIT;
- }
-
- if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
- flags |= BCH_WRITE_FLUSH;
- if (bypass)
- flags |= BCH_WRITE_DISCARD;
-
- bch_write_op_init(&s->iop, dc->disk.c, &s->wbio,
- (struct disk_reservation) { 0 },
- foreground_write_point(dc->disk.c,
- (unsigned long) current),
- bkey_start_pos(&insert_key),
- NULL, flags);
-
- closure_call(&s->iop.cl, bch_write, NULL, cl);
- continue_at(cl, cached_dev_write_complete, NULL);
-}
-
-/* Cached devices - read & write stuff */
-
-static void __cached_dev_make_request(struct request_queue *q, struct bio *bio)
-{
- struct search *s;
- struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
- struct cached_dev *dc = container_of(d, struct cached_dev, disk);
- int rw = bio_data_dir(bio);
-
- generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
-
- bio->bi_bdev = dc->disk_sb.bdev;
- bio->bi_iter.bi_sector += le64_to_cpu(dc->disk_sb.sb->data_offset);
-
- if (cached_dev_get(dc)) {
- struct bio *clone;
-
- s = search_alloc(bio, d);
- trace_bcache_request_start(s->d, bio);
-
- clone = rw ? &s->wbio.bio : &s->rbio.bio;
-
- if (!bio->bi_iter.bi_size) {
- if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
- bch_journal_flush_async(&s->iop.c->journal,
- &s->cl);
-
- /*
- * If it's a flush, we send the flush to the backing
- * device too
- */
- closure_bio_submit(clone, &s->cl);
-
- continue_at(&s->cl, cached_dev_bio_complete, NULL);
- } else {
- s->bypass = check_should_bypass(dc, bio, rw);
-
- if (rw)
- cached_dev_write(dc, s);
- else
- cached_dev_read(dc, s);
- }
- } else {
- if ((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(dc->disk_sb.bdev)))
- bio_endio(bio);
- else
- generic_make_request(bio);
- }
-}
-
-static blk_qc_t cached_dev_make_request(struct request_queue *q,
- struct bio *bio)
-{
- __cached_dev_make_request(q, bio);
- return BLK_QC_T_NONE;
-}
-
-static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- struct cached_dev *dc = container_of(d, struct cached_dev, disk);
- return __blkdev_driver_ioctl(dc->disk_sb.bdev, mode, cmd, arg);
-}
-
-static int cached_dev_congested(void *data, int bits)
-{
- struct bcache_device *d = data;
- struct cached_dev *dc = container_of(d, struct cached_dev, disk);
- struct request_queue *q = bdev_get_queue(dc->disk_sb.bdev);
- int ret = 0;
-
- if (bdi_congested(&q->backing_dev_info, bits))
- return 1;
-
- if (cached_dev_get(dc)) {
- ret |= bch_congested(d->c, bits);
- cached_dev_put(dc);
- }
-
- return ret;
-}
-
-void bch_cached_dev_request_init(struct cached_dev *dc)
-{
- struct gendisk *g = dc->disk.disk;
-
- g->queue->make_request_fn = cached_dev_make_request;
- g->queue->backing_dev_info.congested_fn = cached_dev_congested;
- dc->disk.ioctl = cached_dev_ioctl;
-}
-
-/* Blockdev volumes */
-
-static void __blockdev_volume_make_request(struct request_queue *q,
- struct bio *bio)
-{
- struct search *s;
- struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
- int rw = bio_data_dir(bio);
-
- generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);
-
- trace_bcache_request_start(d, bio);
-
- s = search_alloc(bio, d);
-
- if (!bio->bi_iter.bi_size) {
- if (s->orig_bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
- bch_journal_flush_async(&s->iop.c->journal,
- &s->cl);
-
- continue_at(&s->cl, search_free, NULL);
- } else if (rw) {
- struct disk_reservation res = { 0 };
- unsigned flags = 0;
-
- if (bio_op(bio) != REQ_OP_DISCARD &&
- bch_disk_reservation_get(d->c, &res, bio_sectors(bio), 0)) {
- s->iop.error = -ENOSPC;
- continue_at(&s->cl, search_free, NULL);
- return;
- }
-
- if (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA))
- flags |= BCH_WRITE_FLUSH;
- if (bio_op(bio) == REQ_OP_DISCARD)
- flags |= BCH_WRITE_DISCARD;
-
- bch_write_op_init(&s->iop, d->c, &s->wbio, res,
- foreground_write_point(d->c,
- (unsigned long) current),
- POS(s->inode, bio->bi_iter.bi_sector),
- NULL, flags);
-
- closure_call(&s->iop.cl, bch_write, NULL, &s->cl);
- } else {
- closure_get(&s->cl);
- bch_read(d->c, &s->rbio, bcache_dev_inum(d));
- }
- continue_at(&s->cl, search_free, NULL);
-}
-
-static blk_qc_t blockdev_volume_make_request(struct request_queue *q,
- struct bio *bio)
-{
- __blockdev_volume_make_request(q, bio);
- return BLK_QC_T_NONE;
-}
-
-static int blockdev_volume_ioctl(struct bcache_device *d, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- return -ENOTTY;
-}
-
-static int blockdev_volume_congested(void *data, int bits)
-{
- struct bcache_device *d = data;
-
- return bch_congested(d->c, bits);
-}
-
-void bch_blockdev_volume_request_init(struct bcache_device *d)
-{
- struct gendisk *g = d->disk;
-
- g->queue->make_request_fn = blockdev_volume_make_request;
- g->queue->backing_dev_info.congested_fn = blockdev_volume_congested;
- d->ioctl = blockdev_volume_ioctl;
-}
diff --git a/libbcache/request.h b/libbcache/request.h
deleted file mode 100644
index 1ee3d16f..00000000
--- a/libbcache/request.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _BCACHE_REQUEST_H_
-#define _BCACHE_REQUEST_H_
-
-#include "stats.h"
-
-struct bch_fs;
-struct cached_dev;
-struct bcache_device;
-struct kmem_cache;
-
-unsigned bch_get_congested(struct bch_fs *);
-
-void bch_cached_dev_request_init(struct cached_dev *dc);
-void bch_blockdev_volume_request_init(struct bcache_device *d);
-
-#endif /* _BCACHE_REQUEST_H_ */
diff --git a/libbcache/siphash.c b/libbcache/siphash.c
deleted file mode 100644
index 3a6c9c82..00000000
--- a/libbcache/siphash.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
-
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
- * are the number of compression rounds and the number of finalization rounds.
- * A compression round is identical to a finalization round and this round
- * function is called SipRound. Given a 128-bit key k and a (possibly empty)
- * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
- *
- * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
- * by Jean-Philippe Aumasson and Daniel J. Bernstein,
- * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
- * https://131002.net/siphash/siphash.pdf
- * https://131002.net/siphash/
- */
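
A short usage sketch for the SipHash-2-4 wrappers this file implements (declarations in siphash.h; in-tree compilation assumed, and the constant key is just an example):

	#include <linux/string.h>

	#include "siphash.h"

	static u64 example_siphash24(const void *buf, size_t len)
	{
		SIPHASH_KEY key;
		SIPHASH_CTX ctx;

		memset(&key, 0x42, sizeof(key));	/* 16 byte example key */

		SipHash24_Init(&ctx, &key);
		SipHash24_Update(&ctx, buf, len);
		return SipHash24_End(&ctx);
	}
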
-
-#include <asm/byteorder.h>
-#include <asm/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-
-#include "siphash.h"
-
-static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
-{
- while (rounds--) {
- ctx->v[0] += ctx->v[1];
- ctx->v[2] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 13);
- ctx->v[3] = rol64(ctx->v[3], 16);
-
- ctx->v[1] ^= ctx->v[0];
- ctx->v[3] ^= ctx->v[2];
- ctx->v[0] = rol64(ctx->v[0], 32);
-
- ctx->v[2] += ctx->v[1];
- ctx->v[0] += ctx->v[3];
- ctx->v[1] = rol64(ctx->v[1], 17);
- ctx->v[3] = rol64(ctx->v[3], 21);
-
- ctx->v[1] ^= ctx->v[2];
- ctx->v[3] ^= ctx->v[0];
- ctx->v[2] = rol64(ctx->v[2], 32);
- }
-}
-
-static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
-{
- u64 m = get_unaligned_le64(ptr);
-
- ctx->v[3] ^= m;
- SipHash_Rounds(ctx, rounds);
- ctx->v[0] ^= m;
-}
-
-void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
-{
- u64 k0, k1;
-
- k0 = le64_to_cpu(key->k0);
- k1 = le64_to_cpu(key->k1);
-
- ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
- ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
- ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
- ctx->v[3] = 0x7465646279746573ULL ^ k1;
-
- memset(ctx->buf, 0, sizeof(ctx->buf));
- ctx->bytes = 0;
-}
-
-void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
- const void *src, size_t len)
-{
- const u8 *ptr = src;
- size_t left, used;
-
- if (len == 0)
- return;
-
- used = ctx->bytes % sizeof(ctx->buf);
- ctx->bytes += len;
-
- if (used > 0) {
- left = sizeof(ctx->buf) - used;
-
- if (len >= left) {
- memcpy(&ctx->buf[used], ptr, left);
- SipHash_CRounds(ctx, ctx->buf, rc);
- len -= left;
- ptr += left;
- } else {
- memcpy(&ctx->buf[used], ptr, len);
- return;
- }
- }
-
- while (len >= sizeof(ctx->buf)) {
- SipHash_CRounds(ctx, ptr, rc);
- len -= sizeof(ctx->buf);
- ptr += sizeof(ctx->buf);
- }
-
- if (len > 0)
- memcpy(&ctx->buf[used], ptr, len);
-}
-
-void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
-
- r = SipHash_End(ctx, rc, rf);
-
- *((__le64 *) dst) = cpu_to_le64(r);
-}
-
-u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
-{
- u64 r;
- size_t left, used;
-
- used = ctx->bytes % sizeof(ctx->buf);
- left = sizeof(ctx->buf) - used;
- memset(&ctx->buf[used], 0, left - 1);
- ctx->buf[7] = ctx->bytes;
-
- SipHash_CRounds(ctx, ctx->buf, rc);
- ctx->v[2] ^= 0xff;
- SipHash_Rounds(ctx, rf);
-
- r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
- memset(ctx, 0, sizeof(*ctx));
- return (r);
-}
-
-u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
-{
- SIPHASH_CTX ctx;
-
- SipHash_Init(&ctx, key);
- SipHash_Update(&ctx, rc, rf, src, len);
- return SipHash_End(&ctx, rc, rf);
-}
diff --git a/libbcache/siphash.h b/libbcache/siphash.h
deleted file mode 100644
index 7a4b2241..00000000
--- a/libbcache/siphash.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
-/*-
- * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-/*
- * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
- * optimized for speed on short messages, returning a 64-bit hash/digest value.
- *
- * The number of rounds is selected via the wrapper macros defined below:
- *  SipHash24_*() for the fast and reasonably strong version
- *  SipHash48_*() for the stronger version (half as fast)
- *
- * Usage, with a caller-provided 16 byte key:
- *
- * SIPHASH_KEY key;
- * SIPHASH_CTX ctx;
- *
- * SipHash24_Init(&ctx, &key);
- * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
- * SipHash24_Final(output, &ctx);
- */
-
-#ifndef _SIPHASH_H_
-#define _SIPHASH_H_
-
-#include <linux/types.h>
-
-#define SIPHASH_BLOCK_LENGTH 8
-#define SIPHASH_KEY_LENGTH 16
-#define SIPHASH_DIGEST_LENGTH 8
-
-typedef struct _SIPHASH_CTX {
- u64 v[4];
- u8 buf[SIPHASH_BLOCK_LENGTH];
- u32 bytes;
-} SIPHASH_CTX;
-
-typedef struct {
- __le64 k0;
- __le64 k1;
-} SIPHASH_KEY;
-
-void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
-void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
-u64 SipHash_End(SIPHASH_CTX *, int, int);
-void SipHash_Final(void *, SIPHASH_CTX *, int, int);
-u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
-
-#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
-#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
-#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
-#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
-
-#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
-#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
-#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
-#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
-#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
-
-#endif /* _SIPHASH_H_ */
diff --git a/libbcache/six.c b/libbcache/six.c
deleted file mode 100644
index 1bb8bfcc..00000000
--- a/libbcache/six.c
+++ /dev/null
@@ -1,396 +0,0 @@
-
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-
-#include "six.h"
-
-#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
-#define six_release(l) lock_release(l, 0, _RET_IP_)
-
-#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
-#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
-#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
-
-struct six_lock_vals {
- /* Value we add to the lock in order to take the lock: */
- u64 lock_val;
-
- /* If the lock has this value (used as a mask), taking the lock fails: */
- u64 lock_fail;
-
- /* Value we add to the lock in order to release the lock: */
- u64 unlock_val;
-
- /* Mask that indicates lock is held for this type: */
- u64 held_mask;
-
- /* Waitlist we wakeup when releasing the lock: */
- enum six_lock_type unlock_wakeup;
-};
-
-#define LOCK_VALS { \
- [SIX_LOCK_read] = { \
- .lock_val = __SIX_VAL(read_lock, 1), \
- .lock_fail = __SIX_LOCK_HELD_write, \
- .unlock_val = -__SIX_VAL(read_lock, 1), \
- .held_mask = __SIX_LOCK_HELD_read, \
- .unlock_wakeup = SIX_LOCK_write, \
- }, \
- [SIX_LOCK_intent] = { \
- .lock_val = __SIX_VAL(intent_lock, 1), \
- .lock_fail = __SIX_LOCK_HELD_intent, \
- .unlock_val = -__SIX_VAL(intent_lock, 1), \
- .held_mask = __SIX_LOCK_HELD_intent, \
- .unlock_wakeup = SIX_LOCK_intent, \
- }, \
- [SIX_LOCK_write] = { \
- .lock_val = __SIX_VAL(seq, 1), \
- .lock_fail = __SIX_LOCK_HELD_read, \
- .unlock_val = __SIX_VAL(seq, 1), \
- .held_mask = __SIX_LOCK_HELD_write, \
- .unlock_wakeup = SIX_LOCK_read, \
- }, \
-}
-
-static void six_set_owner(struct six_lock *lock, enum six_lock_type type)
-{
- if (type == SIX_LOCK_intent)
- lock->owner = current;
-}
-
-static void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
-{
- if (type == SIX_LOCK_intent)
- lock->owner = NULL;
-}
-
-static inline bool __six_trylock_type(struct six_lock *lock,
- enum six_lock_type type)
-{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old;
- u64 v = READ_ONCE(lock->state.v);
-
- do {
- old.v = v;
-
- EBUG_ON(type == SIX_LOCK_write &&
- ((old.v & __SIX_LOCK_HELD_write) ||
- !(old.v & __SIX_LOCK_HELD_intent)));
-
- if (old.v & l[type].lock_fail)
- return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- old.v + l[type].lock_val)) != old.v);
- return true;
-}
-
-bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- bool ret = __six_trylock_type(lock, type);
-
- if (ret) {
- six_acquire(&lock->dep_map, 1);
- six_set_owner(lock, type);
- }
-
- return ret;
-}
-
-bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
-{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old;
- u64 v = READ_ONCE(lock->state.v);
-
- do {
- old.v = v;
-
- if (old.seq != seq || old.v & l[type].lock_fail)
- return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- old.v + l[type].lock_val)) != old.v);
-
- six_acquire(&lock->dep_map, 1);
- six_set_owner(lock, type);
- return true;
-}
-
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
-};
-
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-
-static inline int six_can_spin_on_owner(struct six_lock *lock)
-{
- struct task_struct *owner;
- int retval = 1;
-
- if (need_resched())
- return 0;
-
- rcu_read_lock();
- owner = READ_ONCE(lock->owner);
- if (owner)
- retval = owner->on_cpu;
- rcu_read_unlock();
- /*
-	 * if lock->owner is not set, the lock owner may have just acquired
-	 * it and not set the owner yet, or the lock may have been released.
- */
- return retval;
-}
-
-static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner)
-{
- bool ret = true;
-
- rcu_read_lock();
- while (lock->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- if (!owner->on_cpu || need_resched()) {
- ret = false;
- break;
- }
-
- cpu_relax_lowlatency();
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- struct task_struct *task = current;
-
- if (type == SIX_LOCK_write)
- return false;
-
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
-
- if (!osq_lock(&lock->osq))
- goto fail;
-
- while (1) {
- struct task_struct *owner;
-
- /*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner))
- break;
-
- if (__six_trylock_type(lock, type)) {
- osq_unlock(&lock->osq);
- preempt_enable();
- return true;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
-		 * we're an RT task, that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax_lowlatency();
- }
-
- osq_unlock(&lock->osq);
-fail:
- preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
- return false;
-}
-
-void six_lock_type(struct six_lock *lock, enum six_lock_type type)
-{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old, new;
- struct six_lock_waiter wait;
- u64 v;
-
- six_acquire(&lock->dep_map, 0);
-
- if (__six_trylock_type(lock, type))
- goto done;
-
- if (six_optimistic_spin(lock, type))
- goto done;
-
- lock_contended(&lock->dep_map, _RET_IP_);
-
- INIT_LIST_HEAD(&wait.list);
- wait.task = current;
-
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_add_tail(&wait.list, &lock->wait_list[type]);
- raw_spin_unlock(&lock->wait_lock);
- }
-
- v = READ_ONCE(lock->state.v);
- do {
- new.v = old.v = v;
-
- if (!(old.v & l[type].lock_fail))
- new.v += l[type].lock_val;
- else if (!(new.waiters & (1 << type)))
- new.waiters |= 1 << type;
- else
- break; /* waiting bit already set */
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v, new.v)) != old.v);
-
- if (!(old.v & l[type].lock_fail))
- break;
-
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-
- if (!list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_del_init(&wait.list);
- raw_spin_unlock(&lock->wait_lock);
- }
-done:
- lock_acquired(&lock->dep_map, _RET_IP_);
- six_set_owner(lock, type);
-}
-
-static inline void six_lock_wakeup(struct six_lock *lock,
- union six_lock_state state,
- unsigned waitlist_id)
-{
- struct list_head *wait_list = &lock->wait_list[waitlist_id];
- struct six_lock_waiter *w, *next;
-
- if (waitlist_id == SIX_LOCK_write && state.read_lock)
- return;
-
- if (!(state.waiters & (1 << waitlist_id)))
- return;
-
- clear_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
-
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, wait_list, list) {
- list_del_init(&w->list);
-
- if (wake_up_process(w->task) &&
- waitlist_id != SIX_LOCK_read) {
- if (!list_empty(wait_list))
- set_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
- break;
- }
- }
-
- raw_spin_unlock(&lock->wait_lock);
-}
-
-void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state state;
-
- six_clear_owner(lock, type);
-
- EBUG_ON(!(lock->state.v & l[type].held_mask));
- EBUG_ON(type == SIX_LOCK_write &&
- !(lock->state.v & __SIX_LOCK_HELD_intent));
-
- state.v = atomic64_add_return_release(l[type].unlock_val,
- &lock->state.counter);
- six_release(&lock->dep_map);
- six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
-
-bool six_trylock_convert(struct six_lock *lock,
- enum six_lock_type from,
- enum six_lock_type to)
-{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old, new;
- u64 v = READ_ONCE(lock->state.v);
-
- do {
- new.v = old.v = v;
- new.v += l[from].unlock_val;
-
- if (new.v & l[to].lock_fail)
- return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- new.v + l[to].lock_val)) != old.v);
-
- six_clear_owner(lock, from);
- six_set_owner(lock, to);
-
- six_lock_wakeup(lock, new, l[from].unlock_wakeup);
-
- return true;
-}
-
-/*
- * Increment read/intent lock count, assuming we already have it read or intent
- * locked:
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
- const struct six_lock_vals l[] = LOCK_VALS;
-
- EBUG_ON(type == SIX_LOCK_write);
- six_acquire(&lock->dep_map, 0);
-
- /* XXX: assert already locked, and that we don't overflow: */
-
- atomic64_add(l[type].lock_val, &lock->state.counter);
-}
-
-/* Convert from intent to read: */
-void six_lock_downgrade(struct six_lock *lock)
-{
- six_lock_increment(lock, SIX_LOCK_read);
- six_unlock_intent(lock);
-}
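
The lock and convert paths above all share one shape: read the packed state word, compute the desired new value, and retry the compare-exchange until no other CPU changed the word in between. A standalone C11 sketch of that pattern (userspace atomics, hypothetical names, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative only: the retry loop used by six_lock_type() and
 * six_trylock_convert() above, reduced to its essentials.
 */
static bool example_try_add(_Atomic uint64_t *state, uint64_t fail_mask,
			    uint64_t lock_val)
{
	uint64_t old = atomic_load(state);

	do {
		if (old & fail_mask)	/* conflicting holders present */
			return false;
		/* on failure, old is reloaded with the current value */
	} while (!atomic_compare_exchange_weak(state, &old, old + lock_val));

	return true;
}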
diff --git a/libbcache/six.h b/libbcache/six.h
deleted file mode 100644
index 01ed3385..00000000
--- a/libbcache/six.h
+++ /dev/null
@@ -1,136 +0,0 @@
-
-#ifndef _BCACHE_SIX_H
-#define _BCACHE_SIX_H
-
-#include <linux/lockdep.h>
-#include <linux/osq_lock.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-#include "util.h"
-
-/*
- * LOCK STATES:
- *
- * read, intent, write (i.e. shared/intent/exclusive, hence the name)
- *
- * read and write work as with normal read/write locks - a lock can have
- * multiple readers, but write excludes reads and other write locks.
- *
- * Intent does not block read, but it does block other intent locks. The idea is
- * that by taking an intent lock, you can then later upgrade to a write lock
- * without dropping your read lock and without deadlocking - because no other
- * thread has the intent lock and thus no other thread could be trying to take
- * the write lock.
- */
-
-union six_lock_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- /* for waitlist_bitnr() */
- unsigned long l;
- };
-
- struct {
- unsigned read_lock:26;
- unsigned intent_lock:3;
- unsigned waiters:3;
- /*
- * seq works much like in seqlocks: it's incremented every time
- * we lock and unlock for write.
- *
-		 * If it's odd, a write lock is held; if even, it's unlocked.
- *
- * Thus readers can unlock, and then lock again later iff it
- * hasn't been modified in the meantime.
- */
- u32 seq;
- };
-};
-
-#define SIX_LOCK_MAX_RECURSE ((1 << 3) - 1)
-
-enum six_lock_type {
- SIX_LOCK_read,
- SIX_LOCK_intent,
- SIX_LOCK_write,
-};
-
-struct six_lock {
- union six_lock_state state;
- struct task_struct *owner;
- struct optimistic_spin_queue osq;
-
- raw_spinlock_t wait_lock;
- struct list_head wait_list[3];
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-};
-
-static __always_inline void __six_lock_init(struct six_lock *lock,
- const char *name,
- struct lock_class_key *key)
-{
- atomic64_set(&lock->state.counter, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_write]);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- debug_check_no_locks_freed((void *) lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
-
-#define six_lock_init(lock) \
-do { \
- static struct lock_class_key __key; \
- \
- __six_lock_init((lock), #lock, &__key); \
-} while (0)
-
-bool six_trylock_type(struct six_lock *, enum six_lock_type);
-bool six_relock_type(struct six_lock *, enum six_lock_type, unsigned);
-void six_lock_type(struct six_lock *, enum six_lock_type);
-void six_unlock_type(struct six_lock *, enum six_lock_type);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
- enum six_lock_type);
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-void six_lock_downgrade(struct six_lock *);
-
-#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
-
-#define __SIX_LOCK(type) \
-static __always_inline bool six_trylock_##type(struct six_lock *lock) \
-{ \
- return six_trylock_type(lock, SIX_LOCK_##type); \
-} \
- \
-static __always_inline bool six_relock_##type(struct six_lock *lock, u32 seq)\
-{ \
- return six_relock_type(lock, SIX_LOCK_##type, seq); \
-} \
- \
-static __always_inline void six_lock_##type(struct six_lock *lock) \
-{ \
- six_lock_type(lock, SIX_LOCK_##type); \
-} \
- \
-static __always_inline void six_unlock_##type(struct six_lock *lock) \
-{ \
- six_unlock_type(lock, SIX_LOCK_##type); \
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-
-#endif /* _BCACHE_SIX_H */
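
The declarations above compose into the pattern the LOCK STATES comment describes. As a hedged usage sketch (not part of this tree), example_update() and example_relock() below are hypothetical callers of the wrappers generated by __SIX_LOCK():

/*
 * Usage sketch, not from six.h: take intent early so readers can continue,
 * and hold write only for the actual modification.
 */
static void example_update(struct six_lock *lock)
{
	six_lock_intent(lock);		/* readers continue, other intents block */

	/* ... prepare the update while reads are still allowed ... */

	six_lock_write(lock);		/* exclude readers for the change itself */
	/* ... do the modification ... */
	six_unlock_write(lock);

	six_unlock_intent(lock);
}

static bool example_relock(struct six_lock *lock)
{
	u32 seq;

	six_lock_read(lock);
	seq = lock->state.seq;
	/* ... read some state guarded by the lock ... */
	six_unlock_read(lock);

	/* succeeds iff no write lock was taken since we dropped the read lock: */
	return six_relock_read(lock, seq);
}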
diff --git a/libbcache/stats.c b/libbcache/stats.c
deleted file mode 100644
index a8a4eb36..00000000
--- a/libbcache/stats.c
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * bcache stats code
- *
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "stats.h"
-#include "sysfs.h"
-
-/*
- * We keep absolute totals of various statistics, and additionally a set of three
- * rolling averages.
- *
- * Every so often, a timer goes off and rescales the rolling averages.
- * accounting_rescale[] is how many times the timer has to go off before we
- * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
- * and one day.
- *
- * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
- * and accounting_weight is what we use to rescale:
- *
- * pow(31 / 32, 22) ~= 1/2
- *
- * So that we don't have to increment each set of numbers every time we (say)
- * get a cache hit, we increment a single atomic_t in acc->collector, and when
- * the rescale function runs it resets the atomic counter to 0 and adds its
- * old value to each of the exported numbers.
- *
- * To reduce rounding error, the numbers in struct cache_stats are all
- * stored left shifted by 16, and scaled back in the sysfs show() function.
- */
-
-static const unsigned DAY_RESCALE = 288;
-static const unsigned HOUR_RESCALE = 12;
-static const unsigned FIVE_MINUTE_RESCALE = 1;
-static const unsigned accounting_delay = (HZ * 300) / 22;
-static const unsigned accounting_weight = 5;
-
-/* sysfs reading/writing */
-
-read_attribute(cache_hits);
-read_attribute(cache_misses);
-read_attribute(cache_bypass_hits);
-read_attribute(cache_bypass_misses);
-read_attribute(cache_hit_ratio);
-read_attribute(cache_readaheads);
-read_attribute(cache_miss_collisions);
-read_attribute(bypassed);
-read_attribute(foreground_write_ratio);
-read_attribute(foreground_writes);
-read_attribute(gc_writes);
-read_attribute(discards);
-
-SHOW(bch_stats)
-{
- struct cache_stats *s =
- container_of(kobj, struct cache_stats, kobj);
-#define var(stat) (s->stat >> 16)
- var_print(cache_hits);
- var_print(cache_misses);
- var_print(cache_bypass_hits);
- var_print(cache_bypass_misses);
-
- sysfs_print(cache_hit_ratio,
- DIV_SAFE(var(cache_hits) * 100,
- var(cache_hits) + var(cache_misses)));
-
- var_print(cache_readaheads);
- var_print(cache_miss_collisions);
-
- sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
- sysfs_hprint(foreground_writes, var(foreground_write_sectors) << 9);
- sysfs_hprint(gc_writes, var(gc_write_sectors) << 9);
- sysfs_hprint(discards, var(discard_sectors) << 9);
-
- sysfs_print(foreground_write_ratio,
- DIV_SAFE(var(foreground_write_sectors) * 100,
- var(foreground_write_sectors) +
- var(gc_write_sectors)));
-#undef var
- return 0;
-}
-
-STORE(bch_stats)
-{
- return size;
-}
-
-static void bch_stats_release(struct kobject *k)
-{
-}
-
-static struct attribute *bch_stats_files[] = {
- &sysfs_cache_hits,
- &sysfs_cache_misses,
- &sysfs_cache_bypass_hits,
- &sysfs_cache_bypass_misses,
- &sysfs_cache_hit_ratio,
- &sysfs_cache_readaheads,
- &sysfs_cache_miss_collisions,
- &sysfs_bypassed,
- &sysfs_foreground_write_ratio,
- &sysfs_foreground_writes,
- &sysfs_gc_writes,
- &sysfs_discards,
- NULL
-};
-static KTYPE(bch_stats);
-
-int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
- struct kobject *parent)
-{
- int ret = kobject_add(&acc->total.kobj, parent,
- "stats_total");
- ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
- "stats_five_minute");
- ret = ret ?: kobject_add(&acc->hour.kobj, parent,
- "stats_hour");
- ret = ret ?: kobject_add(&acc->day.kobj, parent,
- "stats_day");
- return ret;
-}
-
-void bch_cache_accounting_clear(struct cache_accounting *acc)
-{
- memset(&acc->total.cache_hits,
- 0,
- sizeof(unsigned long) * 9);
-}
-
-void bch_cache_accounting_destroy(struct cache_accounting *acc)
-{
- kobject_put(&acc->total.kobj);
- kobject_put(&acc->five_minute.kobj);
- kobject_put(&acc->hour.kobj);
- kobject_put(&acc->day.kobj);
-
- atomic_set(&acc->closing, 1);
- if (del_timer_sync(&acc->timer))
- closure_return(&acc->cl);
-}
-
-/* EWMA scaling */
-
-static void scale_stat(unsigned long *stat)
-{
- *stat = ewma_add(*stat, 0, accounting_weight);
-}
-
-static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
-{
- if (++stats->rescale == rescale_at) {
- stats->rescale = 0;
- scale_stat(&stats->cache_hits);
- scale_stat(&stats->cache_misses);
- scale_stat(&stats->cache_bypass_hits);
- scale_stat(&stats->cache_bypass_misses);
- scale_stat(&stats->cache_readaheads);
- scale_stat(&stats->cache_miss_collisions);
- scale_stat(&stats->sectors_bypassed);
- scale_stat(&stats->foreground_write_sectors);
- scale_stat(&stats->gc_write_sectors);
- scale_stat(&stats->discard_sectors);
- }
-}
-
-static void scale_accounting(unsigned long data)
-{
- struct cache_accounting *acc = (struct cache_accounting *) data;
-
-#define move_stat(name) do { \
- unsigned t = atomic_xchg(&acc->collector.name, 0); \
- t <<= 16; \
- acc->five_minute.name += t; \
- acc->hour.name += t; \
- acc->day.name += t; \
- acc->total.name += t; \
-} while (0)
-
- move_stat(cache_hits);
- move_stat(cache_misses);
- move_stat(cache_bypass_hits);
- move_stat(cache_bypass_misses);
- move_stat(cache_readaheads);
- move_stat(cache_miss_collisions);
- move_stat(sectors_bypassed);
- move_stat(foreground_write_sectors);
- move_stat(gc_write_sectors);
- move_stat(discard_sectors);
-
- scale_stats(&acc->total, 0);
- scale_stats(&acc->day, DAY_RESCALE);
- scale_stats(&acc->hour, HOUR_RESCALE);
- scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
-
- acc->timer.expires += accounting_delay;
-
- if (!atomic_read(&acc->closing))
- add_timer(&acc->timer);
- else
- closure_return(&acc->cl);
-}
-
-void bch_cache_accounting_init(struct cache_accounting *acc,
- struct closure *parent)
-{
- kobject_init(&acc->total.kobj, &bch_stats_ktype);
- kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
- kobject_init(&acc->hour.kobj, &bch_stats_ktype);
- kobject_init(&acc->day.kobj, &bch_stats_ktype);
-
- closure_init(&acc->cl, parent);
- init_timer(&acc->timer);
- acc->timer.expires = jiffies + accounting_delay;
- acc->timer.data = (unsigned long) acc;
- acc->timer.function = scale_accounting;
- add_timer(&acc->timer);
-}
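
As a standalone illustration of the arithmetic in the comment at the top of this file (plain userspace C, not part of the tree), the following program applies the 31/32 scaling step 22 times to a counter stored left shifted by 16 and prints roughly half of the starting value:

#include <stdio.h>

/* the per-tick 31/32 scaling described in the comment above */
static unsigned long decay_step(unsigned long v)
{
	return ((v << 5) - v) >> 5;
}

int main(void)
{
	unsigned long v = 1000UL << 16;	/* stats are stored left shifted by 16 */
	int i;

	for (i = 0; i < 22; i++)	/* the timer fires 22 times per 5 minutes */
		v = decay_step(v);

	printf("%lu\n", v >> 16);	/* prints about 497: a 5 minute half life */
	return 0;
}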
diff --git a/libbcache/stats.h b/libbcache/stats.h
deleted file mode 100644
index a3c7bd26..00000000
--- a/libbcache/stats.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef _BCACHE_STATS_H_
-#define _BCACHE_STATS_H_
-
-#include "stats_types.h"
-
-struct bch_fs;
-struct cached_dev;
-struct bcache_device;
-
-#ifndef NO_BCACHE_ACCOUNTING
-
-void bch_cache_accounting_init(struct cache_accounting *, struct closure *);
-int bch_cache_accounting_add_kobjs(struct cache_accounting *, struct kobject *);
-void bch_cache_accounting_clear(struct cache_accounting *);
-void bch_cache_accounting_destroy(struct cache_accounting *);
-
-#else
-
-static inline void bch_cache_accounting_init(struct cache_accounting *acc,
- struct closure *cl) {}
-static inline int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
- struct kobject *cl)
-{
- return 0;
-}
-static inline void bch_cache_accounting_clear(struct cache_accounting *acc) {}
-static inline void bch_cache_accounting_destroy(struct cache_accounting *acc) {}
-
-#endif
-
-static inline void mark_cache_stats(struct cache_stat_collector *stats,
- bool hit, bool bypass)
-{
- atomic_inc(&stats->cache_hit_array[!bypass][!hit]);
-}
-
-static inline void bch_mark_cache_accounting(struct bch_fs *c,
- struct cached_dev *dc,
- bool hit, bool bypass)
-{
- mark_cache_stats(&dc->accounting.collector, hit, bypass);
- mark_cache_stats(&c->accounting.collector, hit, bypass);
-}
-
-static inline void bch_mark_sectors_bypassed(struct bch_fs *c,
- struct cached_dev *dc,
- unsigned sectors)
-{
- atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
- atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
-}
-
-static inline void bch_mark_gc_write(struct bch_fs *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.gc_write_sectors);
-}
-
-static inline void bch_mark_foreground_write(struct bch_fs *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.foreground_write_sectors);
-}
-
-static inline void bch_mark_discard(struct bch_fs *c, int sectors)
-{
- atomic_add(sectors, &c->accounting.collector.discard_sectors);
-}
-
-#endif /* _BCACHE_STATS_H_ */
diff --git a/libbcache/stats_types.h b/libbcache/stats_types.h
deleted file mode 100644
index 28e4c69e..00000000
--- a/libbcache/stats_types.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _BCACHE_STATS_TYPES_H_
-#define _BCACHE_STATS_TYPES_H_
-
-struct cache_stat_collector {
- union {
- struct {
- atomic_t cache_hits;
- atomic_t cache_misses;
- atomic_t cache_bypass_hits;
- atomic_t cache_bypass_misses;
- };
-
- /* cache_hit_array[!bypass][!hit]: */
- atomic_t cache_hit_array[2][2];
- };
-
- atomic_t cache_readaheads;
- atomic_t cache_miss_collisions;
- atomic_t sectors_bypassed;
- atomic_t foreground_write_sectors;
- atomic_t gc_write_sectors;
- atomic_t discard_sectors;
-};
-
-struct cache_stats {
- struct kobject kobj;
-
- unsigned long cache_hits;
- unsigned long cache_misses;
- unsigned long cache_bypass_hits;
- unsigned long cache_bypass_misses;
- unsigned long cache_readaheads;
- unsigned long cache_miss_collisions;
- unsigned long sectors_bypassed;
- unsigned long foreground_write_sectors;
- unsigned long gc_write_sectors;
- unsigned long discard_sectors;
-
- unsigned rescale;
-};
-
-struct cache_accounting {
- struct closure cl;
- struct timer_list timer;
- atomic_t closing;
-
- struct cache_stat_collector collector;
-
- struct cache_stats total;
- struct cache_stats five_minute;
- struct cache_stats hour;
- struct cache_stats day;
-};
-
-#endif /* _BCACHE_STATS_TYPES_H_ */
diff --git a/libbcache/str_hash.h b/libbcache/str_hash.h
deleted file mode 100644
index 1173dfe8..00000000
--- a/libbcache/str_hash.h
+++ /dev/null
@@ -1,384 +0,0 @@
-#ifndef _BCACHE_STR_HASH_H
-#define _BCACHE_STR_HASH_H
-
-#include "btree_iter.h"
-#include "checksum.h"
-#include "inode.h"
-#include "siphash.h"
-#include "super.h"
-
-#include <linux/crc32c.h>
-#include <crypto/hash.h>
-
-struct bch_hash_info {
- u8 type;
- union {
- __le64 crc_key;
- SIPHASH_KEY siphash_key;
- };
-};
-
-static inline struct bch_hash_info
-bch_hash_info_init(const struct bch_inode_unpacked *bi)
-{
- /* XXX ick */
- struct bch_hash_info info = {
- .type = (bi->i_flags >> INODE_STR_HASH_OFFSET) &
- ~(~0 << INODE_STR_HASH_BITS)
- };
-
- switch (info.type) {
- case BCH_STR_HASH_CRC32C:
- case BCH_STR_HASH_CRC64:
- info.crc_key = bi->i_hash_seed;
- break;
- case BCH_STR_HASH_SIPHASH: {
- SHASH_DESC_ON_STACK(desc, bch_sha256);
- u8 digest[crypto_shash_digestsize(bch_sha256)];
-
- desc->tfm = bch_sha256;
- desc->flags = 0;
-
- crypto_shash_digest(desc, (void *) &bi->i_hash_seed,
- sizeof(bi->i_hash_seed), digest);
- memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
- break;
- }
- default:
- BUG();
- }
-
- return info;
-}
-
-struct bch_str_hash_ctx {
- union {
- u32 crc32c;
- u64 crc64;
- SIPHASH_CTX siphash;
- };
-};
-
-static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_CRC32C:
- ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
- break;
- case BCH_STR_HASH_CRC64:
- ctx->crc64 = bch_crc64_update(~0, &info->crc_key, sizeof(info->crc_key));
- break;
- case BCH_STR_HASH_SIPHASH:
- SipHash24_Init(&ctx->siphash, &info->siphash_key);
- break;
- default:
- BUG();
- }
-}
-
-static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info,
- const void *data, size_t len)
-{
- switch (info->type) {
- case BCH_STR_HASH_CRC32C:
- ctx->crc32c = crc32c(ctx->crc32c, data, len);
- break;
- case BCH_STR_HASH_CRC64:
- ctx->crc64 = bch_crc64_update(ctx->crc64, data, len);
- break;
- case BCH_STR_HASH_SIPHASH:
- SipHash24_Update(&ctx->siphash, data, len);
- break;
- default:
- BUG();
- }
-}
-
-static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
-{
- switch (info->type) {
- case BCH_STR_HASH_CRC32C:
- return ctx->crc32c;
- case BCH_STR_HASH_CRC64:
- return ctx->crc64 >> 1;
- case BCH_STR_HASH_SIPHASH:
- return SipHash24_End(&ctx->siphash) >> 1;
- default:
- BUG();
- }
-}
-
-struct bch_hash_desc {
- enum btree_id btree_id;
- u8 key_type;
- u8 whiteout_type;
-
- u64 (*hash_key)(const struct bch_hash_info *, const void *);
- u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
- bool (*cmp_key)(struct bkey_s_c, const void *);
- bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
-};
-
-static inline struct bkey_s_c
-bch_hash_lookup_at(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter, const void *search)
-{
- u64 inode = iter->pos.inode;
-
- do {
- struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
-
- if (btree_iter_err(k))
- return k;
-
- if (k.k->type == desc.key_type) {
- if (!desc.cmp_key(k, search))
- return k;
- } else if (k.k->type == desc.whiteout_type) {
- ;
- } else {
- /* hole, not found */
- break;
- }
-
- bch_btree_iter_advance_pos(iter);
- } while (iter->pos.inode == inode);
-
- return bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c
-bch_hash_lookup_bkey_at(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter, struct bkey_s_c search)
-{
- u64 inode = iter->pos.inode;
-
- do {
- struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
-
- if (btree_iter_err(k))
- return k;
-
- if (k.k->type == desc.key_type) {
- if (!desc.cmp_bkey(k, search))
- return k;
- } else if (k.k->type == desc.whiteout_type) {
- ;
- } else {
- /* hole, not found */
- break;
- }
-
- bch_btree_iter_advance_pos(iter);
- } while (iter->pos.inode == inode);
-
- return bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c
-bch_hash_lookup(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- struct btree_iter *iter, const void *key)
-{
- bch_btree_iter_init(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
-
- return bch_hash_lookup_at(desc, info, iter, key);
-}
-
-static inline struct bkey_s_c
-bch_hash_lookup_intent(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- struct btree_iter *iter, const void *key)
-{
- bch_btree_iter_init_intent(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
-
- return bch_hash_lookup_at(desc, info, iter, key);
-}
-
-static inline struct bkey_s_c
-bch_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter)
-{
- while (1) {
- struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
-
- if (btree_iter_err(k))
- return k;
-
- if (k.k->type != desc.key_type)
- return k;
-
- /* hash collision, keep going */
- bch_btree_iter_advance_pos(iter);
- if (iter->pos.inode != k.k->p.inode)
- return bkey_s_c_err(-ENOENT);
- }
-}
-
-static inline struct bkey_s_c bch_hash_hole(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- struct btree_iter *iter,
- const void *key)
-{
- bch_btree_iter_init_intent(iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
-
- return bch_hash_hole_at(desc, iter);
-}
-
-static inline int bch_hash_needs_whiteout(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter,
- struct btree_iter *start)
-{
- bch_btree_iter_set_pos(iter,
- btree_type_successor(start->btree_id, start->pos));
-
- while (1) {
- struct bkey_s_c k = bch_btree_iter_peek_with_holes(iter);
- int ret = btree_iter_err(k);
-
- if (ret)
- return ret;
-
- if (k.k->type != desc.key_type &&
- k.k->type != desc.whiteout_type)
- return false;
-
- if (k.k->type == desc.key_type &&
- desc.hash_bkey(info, k) <= start->pos.offset)
- return true;
-
- bch_btree_iter_advance_pos(iter);
- }
-}
-
-#define BCH_HASH_SET_MUST_CREATE 1
-#define BCH_HASH_SET_MUST_REPLACE 2
-
-static inline int bch_hash_set(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- u64 *journal_seq,
- struct bkey_i *insert, int flags)
-{
- struct btree_iter iter, hashed_slot;
- struct bkey_s_c k;
- int ret;
-
- bch_btree_iter_init_intent(&hashed_slot, c, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))));
- bch_btree_iter_init_intent(&iter, c, desc.btree_id, hashed_slot.pos);
- bch_btree_iter_link(&hashed_slot, &iter);
-retry:
- /*
- * On hash collision, we have to keep the slot we hashed to locked while
- * we do the insert - to avoid racing with another thread deleting
- * whatever's in the slot we hashed to:
- */
- ret = bch_btree_iter_traverse(&hashed_slot);
- if (ret)
- goto err;
-
- /*
- * On -EINTR/retry, we dropped locks - always restart from the slot we
- * hashed to:
- */
- bch_btree_iter_copy(&iter, &hashed_slot);
-
- k = bch_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert));
-
- ret = btree_iter_err(k);
- if (ret == -ENOENT) {
- if (flags & BCH_HASH_SET_MUST_REPLACE) {
- ret = -ENOENT;
- goto err;
- }
-
- /*
- * Not found, so we're now looking for any open
- * slot - we might have skipped over a whiteout
- * that we could have used, so restart from the
- * slot we hashed to:
- */
- bch_btree_iter_copy(&iter, &hashed_slot);
- k = bch_hash_hole_at(desc, &iter);
- if ((ret = btree_iter_err(k)))
- goto err;
- } else if (!ret) {
- if (flags & BCH_HASH_SET_MUST_CREATE) {
- ret = -EEXIST;
- goto err;
- }
- } else {
- goto err;
- }
-
- insert->k.p = iter.pos;
- ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&iter, insert));
-err:
- if (ret == -EINTR)
- goto retry;
-
- /*
- * On successful insert, we don't want to clobber ret with error from
- * iter:
- */
- bch_btree_iter_unlock(&iter);
- bch_btree_iter_unlock(&hashed_slot);
- return ret;
-}
-
-static inline int bch_hash_delete(const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct bch_fs *c, u64 inode,
- u64 *journal_seq, const void *key)
-{
- struct btree_iter iter, whiteout_iter;
- struct bkey_s_c k;
- struct bkey_i delete;
- int ret = -ENOENT;
-
- bch_btree_iter_init_intent(&iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
- bch_btree_iter_init(&whiteout_iter, c, desc.btree_id,
- POS(inode, desc.hash_key(info, key)));
- bch_btree_iter_link(&iter, &whiteout_iter);
-retry:
- k = bch_hash_lookup_at(desc, info, &iter, key);
- if ((ret = btree_iter_err(k)))
- goto err;
-
- ret = bch_hash_needs_whiteout(desc, info, &whiteout_iter, &iter);
- if (ret < 0)
- goto err;
-
- bkey_init(&delete.k);
- delete.k.p = k.k->p;
- delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
-
- ret = bch_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&iter, &delete));
-err:
- if (ret == -EINTR)
- goto retry;
-
- bch_btree_iter_unlock(&whiteout_iter);
- bch_btree_iter_unlock(&iter);
- return ret;
-}
-
-#endif /* _BCACHE_STR_HASH_H */
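
The descriptor interface above is filled in by its users (such as the dirent code) with type-specific hash and compare callbacks. Purely as an illustration - not taken from this tree - this is the shape a hash_key callback takes for a NUL-terminated string key, using the context helpers declared above; example_hash_key() is a hypothetical name:

static u64 example_hash_key(const struct bch_hash_info *info, const void *key)
{
	struct bch_str_hash_ctx ctx;

	/* hash the seed-dependent prefix, then the key itself: */
	bch_str_hash_init(&ctx, info);
	bch_str_hash_update(&ctx, info, key, strlen(key));
	return bch_str_hash_end(&ctx, info);
}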
diff --git a/libbcache/super-io.c b/libbcache/super-io.c
deleted file mode 100644
index 67c03e19..00000000
--- a/libbcache/super-io.c
+++ /dev/null
@@ -1,820 +0,0 @@
-
-#include "bcache.h"
-#include "blockdev.h"
-#include "checksum.h"
-#include "error.h"
-#include "io.h"
-#include "journal.h"
-#include "super-io.h"
-#include "super.h"
-#include "vstructs.h"
-
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
-
-static inline void __bch_sb_layout_size_assert(void)
-{
- BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
-}
-
-struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb,
- enum bch_sb_field_type type)
-{
- struct bch_sb_field *f;
-
- /* XXX: need locking around superblock to access optional fields */
-
- vstruct_for_each(sb, f)
- if (le32_to_cpu(f->type) == type)
- return f;
- return NULL;
-}
-
-void bch_free_super(struct bcache_superblock *sb)
-{
- if (sb->bio)
- bio_put(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, sb->mode);
-
- free_pages((unsigned long) sb->sb, sb->page_order);
- memset(sb, 0, sizeof(*sb));
-}
-
-static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
-{
- struct bch_sb *new_sb;
- struct bio *bio;
-
- if (sb->page_order >= order && sb->sb)
- return 0;
-
- if (dynamic_fault("bcache:add:super_realloc"))
- return -ENOMEM;
-
- bio = bio_kmalloc(GFP_KERNEL, 1 << order);
- if (!bio)
- return -ENOMEM;
-
- if (sb->bio)
- bio_put(sb->bio);
- sb->bio = bio;
-
- new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
- if (!new_sb)
- return -ENOMEM;
-
- if (sb->sb)
- memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
-
- free_pages((unsigned long) sb->sb, sb->page_order);
- sb->sb = new_sb;
-
- sb->page_order = order;
-
- return 0;
-}
-
-static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
-{
- u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
- u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
-
- if (new_bytes > max_bytes) {
- char buf[BDEVNAME_SIZE];
-
- pr_err("%s: superblock too big: want %llu but have %llu",
- bdevname(sb->bdev, buf), new_bytes, max_bytes);
- return -ENOSPC;
- }
-
- return __bch_super_realloc(sb, get_order(new_bytes));
-}
-
-static int bch_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
-{
- u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
- struct bch_sb *sb;
- unsigned order = get_order(bytes);
-
- if (c->disk_sb && order <= c->disk_sb_order)
- return 0;
-
- sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
- if (!sb)
- return -ENOMEM;
-
- if (c->disk_sb)
- memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);
-
- free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
-
- c->disk_sb = sb;
- c->disk_sb_order = order;
- return 0;
-}
-
-static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb,
- struct bch_sb_field *f,
- unsigned u64s)
-{
- unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-
- if (!f) {
- f = vstruct_last(sb);
- memset(f, 0, sizeof(u64) * u64s);
- f->u64s = cpu_to_le32(u64s);
- f->type = 0;
- } else {
- void *src, *dst;
-
- src = vstruct_end(f);
- f->u64s = cpu_to_le32(u64s);
- dst = vstruct_end(f);
-
- memmove(dst, src, vstruct_end(sb) - src);
-
- if (dst > src)
- memset(src, 0, dst - src);
- }
-
- le32_add_cpu(&sb->u64s, u64s - old_u64s);
-
- return f;
-}
-
-struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch_sb_field_get(sb->sb, type);
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
-
- if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
- return NULL;
-
- f = __bch_sb_field_resize(sb->sb, f, u64s);
- f->type = type;
- return f;
-}
-
-struct bch_sb_field *bch_fs_sb_field_resize(struct bch_fs *c,
- enum bch_sb_field_type type,
- unsigned u64s)
-{
- struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type);
- ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
- ssize_t d = -old_u64s + u64s;
- struct bch_dev *ca;
- unsigned i;
-
- lockdep_assert_held(&c->sb_lock);
-
- if (bch_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
- return NULL;
-
-	/* XXX: we're not checking that offline devices have enough space */
-
- for_each_online_member(ca, c, i) {
- struct bcache_superblock *sb = &ca->disk_sb;
-
- if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->ref);
- return NULL;
- }
- }
-
- f = __bch_sb_field_resize(c->disk_sb, f, u64s);
- f->type = type;
- return f;
-}
-
-static const char *validate_sb_layout(struct bch_sb_layout *layout)
-{
- u64 offset, prev_offset, max_sectors;
- unsigned i;
-
- if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
- return "Not a bcache superblock layout";
-
- if (layout->layout_type != 0)
- return "Invalid superblock layout type";
-
- if (!layout->nr_superblocks)
- return "Invalid superblock layout: no superblocks";
-
- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
- return "Invalid superblock layout: too many superblocks";
-
- max_sectors = 1 << layout->sb_max_size_bits;
-
- prev_offset = le64_to_cpu(layout->sb_offset[0]);
-
- for (i = 1; i < layout->nr_superblocks; i++) {
- offset = le64_to_cpu(layout->sb_offset[i]);
-
- if (offset < prev_offset + max_sectors)
- return "Invalid superblock layout: superblocks overlap";
- prev_offset = offset;
- }
-
- return NULL;
-}
-
-static int u64_cmp(const void *_l, const void *_r)
-{
- u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
-
- return l < r ? -1 : l > r ? 1 : 0;
-}
-
-const char *bch_validate_journal_layout(struct bch_sb *sb,
- struct bch_member_cpu mi)
-{
- struct bch_sb_field_journal *journal;
- const char *err;
- unsigned nr;
- unsigned i;
- u64 *b;
-
- journal = bch_sb_get_journal(sb);
- if (!journal)
- return NULL;
-
- nr = bch_nr_journal_buckets(journal);
- if (!nr)
- return NULL;
-
-	b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
- if (!b)
- return "cannot allocate memory";
-
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
-
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
- err = "journal bucket at sector 0";
- if (!b[0])
- goto err;
-
- err = "journal bucket before first bucket";
- if (b[0] < mi.first_bucket)
- goto err;
-
- err = "journal bucket past end of device";
- if (b[nr - 1] >= mi.nbuckets)
- goto err;
-
- err = "duplicate journal buckets";
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1])
- goto err;
-
- err = NULL;
-err:
- kfree(b);
- return err;
-}
-
-static const char *bch_sb_validate_members(struct bch_sb *sb)
-{
- struct bch_sb_field_members *mi;
- unsigned i;
-
- mi = bch_sb_get_members(sb);
- if (!mi)
- return "Invalid superblock: member info area missing";
-
- if ((void *) (mi->members + sb->nr_devices) >
- vstruct_end(&mi->field))
- return "Invalid superblock: bad member info";
-
- for (i = 0; i < sb->nr_devices; i++) {
- if (bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
- continue;
-
- if (le16_to_cpu(mi->members[i].bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb))
- return "bucket size smaller than btree node size";
- }
-
- return NULL;
-}
-
-const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
-{
- struct bch_sb *sb = disk_sb->sb;
- struct bch_sb_field *f;
- struct bch_sb_field_members *sb_mi;
- struct bch_member_cpu mi;
- const char *err;
- u16 block_size;
-
- switch (le64_to_cpu(sb->version)) {
- case BCACHE_SB_VERSION_CDEV_V4:
- break;
- default:
-		return "Unsupported superblock version";
- }
-
- if (BCH_SB_INITIALIZED(sb) &&
- le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4)
- return "Unsupported superblock version";
-
- block_size = le16_to_cpu(sb->block_size);
-
- if (!is_power_of_2(block_size) ||
- block_size > PAGE_SECTORS)
- return "Bad block size";
-
- if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
- return "Bad user UUID";
-
- if (bch_is_zero(sb->uuid.b, sizeof(uuid_le)))
- return "Bad internal UUID";
-
- if (!sb->nr_devices ||
- sb->nr_devices <= sb->dev_idx ||
- sb->nr_devices > BCH_SB_MEMBERS_MAX)
- return "Bad cache device number in set";
-
- if (!BCH_SB_META_REPLICAS_WANT(sb) ||
- BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
-
- if (!BCH_SB_META_REPLICAS_REQ(sb) ||
- BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
-
- if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
- BCH_SB_META_REPLICAS_HAVE(sb) >
- BCH_SB_META_REPLICAS_WANT(sb))
- return "Invalid number of metadata replicas";
-
- if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
- BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
-
- if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
- BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
-		return "Invalid number of data replicas";
-
- if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
- BCH_SB_DATA_REPLICAS_HAVE(sb) >
- BCH_SB_DATA_REPLICAS_WANT(sb))
- return "Invalid number of data replicas";
-
- if (!BCH_SB_BTREE_NODE_SIZE(sb))
- return "Btree node size not set";
-
- if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
- return "Btree node size not a power of two";
-
- if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
- return "Btree node size too large";
-
- if (BCH_SB_GC_RESERVE(sb) < 5)
- return "gc reserve percentage too small";
-
- if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size)
- return "max journal entry size too small";
-
-	/* 4 MB max: */
- if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
- return "max journal entry size too big";
-
- if (!sb->time_precision ||
- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
- return "invalid time precision";
-
- /* validate layout */
- err = validate_sb_layout(&sb->layout);
- if (err)
- return err;
-
- vstruct_for_each(sb, f) {
- if (!f->u64s)
- return "Invalid superblock: invalid optional field";
-
- if (vstruct_next(f) > vstruct_last(sb))
- return "Invalid superblock: invalid optional field";
-
- if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR)
- return "Invalid superblock: unknown optional field type";
- }
-
- err = bch_sb_validate_members(sb);
- if (err)
- return err;
-
- sb_mi = bch_sb_get_members(sb);
- mi = bch_mi_to_cpu(sb_mi->members + sb->dev_idx);
-
- if (mi.nbuckets > LONG_MAX)
- return "Too many buckets";
-
- if (mi.nbuckets - mi.first_bucket < 1 << 10)
- return "Not enough buckets";
-
- if (!is_power_of_2(mi.bucket_size) ||
- mi.bucket_size < PAGE_SECTORS ||
- mi.bucket_size < block_size)
- return "Bad bucket size";
-
- if (get_capacity(disk_sb->bdev->bd_disk) <
- mi.bucket_size * mi.nbuckets)
- return "Invalid superblock: device too small";
-
- err = bch_validate_journal_layout(sb, mi);
- if (err)
- return err;
-
- return NULL;
-}
-
-/* device open: */
-
-static const char *bch_blkdev_open(const char *path, fmode_t mode,
- void *holder, struct block_device **ret)
-{
- struct block_device *bdev;
-
- *ret = NULL;
- bdev = blkdev_get_by_path(path, mode, holder);
- if (bdev == ERR_PTR(-EBUSY))
- return "device busy";
-
- if (IS_ERR(bdev))
- return "failed to open device";
-
- if (mode & FMODE_WRITE)
- bdev_get_queue(bdev)->backing_dev_info.capabilities
- |= BDI_CAP_STABLE_WRITES;
-
- *ret = bdev;
- return NULL;
-}
-
-static void bch_sb_update(struct bch_fs *c)
-{
- struct bch_sb *src = c->disk_sb;
- struct bch_sb_field_members *mi = bch_sb_get_members(src);
- struct bch_dev *ca;
- unsigned i;
-
- lockdep_assert_held(&c->sb_lock);
-
- c->sb.uuid = src->uuid;
- c->sb.user_uuid = src->user_uuid;
- c->sb.block_size = le16_to_cpu(src->block_size);
- c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src);
- c->sb.nr_devices = src->nr_devices;
- c->sb.clean = BCH_SB_CLEAN(src);
- c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src);
- c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src);
- c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src);
- c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
- c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
- c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
- c->sb.time_precision = le32_to_cpu(src->time_precision);
-
- for_each_member_device(ca, c, i)
- ca->mi = bch_mi_to_cpu(mi->members + i);
-}
-
-/* doesn't copy member info */
-static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
-{
- struct bch_sb_field *src_f, *dst_f;
-
- dst->version = src->version;
- dst->seq = src->seq;
- dst->uuid = src->uuid;
- dst->user_uuid = src->user_uuid;
- memcpy(dst->label, src->label, sizeof(dst->label));
-
- dst->block_size = src->block_size;
- dst->nr_devices = src->nr_devices;
-
- dst->time_base_lo = src->time_base_lo;
- dst->time_base_hi = src->time_base_hi;
- dst->time_precision = src->time_precision;
-
- memcpy(dst->flags, src->flags, sizeof(dst->flags));
- memcpy(dst->features, src->features, sizeof(dst->features));
- memcpy(dst->compat, src->compat, sizeof(dst->compat));
-
- vstruct_for_each(src, src_f) {
- if (src_f->type == BCH_SB_FIELD_journal)
- continue;
-
- dst_f = bch_sb_field_get(dst, src_f->type);
- dst_f = __bch_sb_field_resize(dst, dst_f,
- le32_to_cpu(src_f->u64s));
-
- memcpy(dst_f, src_f, vstruct_bytes(src_f));
- }
-}
-
-int bch_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
-{
- struct bch_sb_field_journal *journal_buckets =
- bch_sb_get_journal(src);
- unsigned journal_u64s = journal_buckets
- ? le32_to_cpu(journal_buckets->field.u64s)
- : 0;
-
- lockdep_assert_held(&c->sb_lock);
-
- if (bch_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s))
- return -ENOMEM;
-
- __copy_super(c->disk_sb, src);
- bch_sb_update(c);
-
- return 0;
-}
-
-int bch_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
- struct bch_sb_field_journal *journal_buckets =
- bch_sb_get_journal(dst);
- unsigned journal_u64s = journal_buckets
- ? le32_to_cpu(journal_buckets->field.u64s)
- : 0;
- unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
- int ret;
-
- ret = bch_sb_realloc(&ca->disk_sb, u64s);
- if (ret)
- return ret;
-
- __copy_super(dst, src);
-
- return 0;
-}
-
-/* read superblock: */
-
-static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
-{
- struct bch_csum csum;
- size_t bytes;
- unsigned order;
-reread:
- bio_reset(sb->bio);
- sb->bio->bi_bdev = sb->bdev;
- sb->bio->bi_iter.bi_sector = offset;
- sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
- bch_bio_map(sb->bio, sb->sb);
-
- if (submit_bio_wait(sb->bio))
- return "IO error";
-
- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
- return "Not a bcache superblock";
-
- if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4)
- return "Unsupported superblock version";
-
- bytes = vstruct_bytes(sb->sb);
-
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
- return "Bad superblock: too big";
-
- order = get_order(bytes);
- if (order > sb->page_order) {
- if (__bch_super_realloc(sb, order))
- return "cannot allocate memory";
- goto reread;
- }
-
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
- return "unknown csum type";
-
- /* XXX: verify MACs */
- csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- (struct nonce) { 0 }, sb->sb);
-
- if (bch_crc_cmp(csum, sb->sb->csum))
- return "bad checksum reading superblock";
-
- return NULL;
-}
-
-const char *bch_read_super(struct bcache_superblock *sb,
- struct bch_opts opts,
- const char *path)
-{
- u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR;
- struct bch_sb_layout layout;
- const char *err;
- unsigned i;
-
- memset(sb, 0, sizeof(*sb));
- sb->mode = FMODE_READ;
-
- if (!(opt_defined(opts.noexcl) && opts.noexcl))
- sb->mode |= FMODE_EXCL;
-
- if (!(opt_defined(opts.nochanges) && opts.nochanges))
- sb->mode |= FMODE_WRITE;
-
- err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev);
- if (err)
- return err;
-
- err = "cannot allocate memory";
- if (__bch_super_realloc(sb, 0))
- goto err;
-
- err = "dynamic fault";
- if (bch_fs_init_fault("read_super"))
- goto err;
-
- err = read_one_super(sb, offset);
- if (!err)
- goto got_super;
-
- if (offset != BCH_SB_SECTOR) {
- pr_err("error reading superblock: %s", err);
- goto err;
- }
-
- pr_err("error reading default superblock: %s", err);
-
- /*
- * Error reading primary superblock - read location of backup
- * superblocks:
- */
- bio_reset(sb->bio);
- sb->bio->bi_bdev = sb->bdev;
- sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
- /*
- * use sb buffer to read layout, since sb buffer is page aligned but
- * layout won't be:
- */
- bch_bio_map(sb->bio, sb->sb);
-
- err = "IO error";
- if (submit_bio_wait(sb->bio))
- goto err;
-
- memcpy(&layout, sb->sb, sizeof(layout));
- err = validate_sb_layout(&layout);
- if (err)
- goto err;
-
- for (i = 0; i < layout.nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout.sb_offset[i]);
-
- if (offset == BCH_SB_SECTOR)
- continue;
-
- err = read_one_super(sb, offset);
- if (!err)
- goto got_super;
- }
- goto err;
-got_super:
- pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
- le64_to_cpu(sb->sb->version),
- le64_to_cpu(sb->sb->flags),
- le64_to_cpu(sb->sb->seq),
- le16_to_cpu(sb->sb->u64s));
-
- err = "Superblock block size smaller than device block size";
- if (le16_to_cpu(sb->sb->block_size) << 9 <
- bdev_logical_block_size(sb->bdev))
- goto err;
-
- return NULL;
-err:
- bch_free_super(sb);
- return err;
-}
-
-/* write superblock: */
-
-static void write_super_endio(struct bio *bio)
-{
- struct bch_dev *ca = bio->bi_private;
-
- /* XXX: return errors directly */
-
- bch_dev_fatal_io_err_on(bio->bi_error, ca, "superblock write");
-
- bch_account_io_completion(ca);
-
- closure_put(&ca->fs->sb_write);
- percpu_ref_put(&ca->io_ref);
-}
-
-static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
-{
- struct bch_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- if (idx >= sb->layout.nr_superblocks)
- return false;
-
- if (!percpu_ref_tryget(&ca->io_ref))
- return false;
-
- sb->offset = sb->layout.sb_offset[idx];
-
- SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
- sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
- (struct nonce) { 0 }, sb);
-
- bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
- bio->bi_iter.bi_size =
- roundup(vstruct_bytes(sb),
- bdev_logical_block_size(ca->disk_sb.bdev));
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
- bch_bio_map(bio, sb);
-
- closure_bio_submit_punt(bio, &c->sb_write, c);
- return true;
-}
-
-void bch_write_super(struct bch_fs *c)
-{
- struct closure *cl = &c->sb_write;
- struct bch_dev *ca;
- unsigned i, super_idx = 0;
- bool wrote;
-
- lockdep_assert_held(&c->sb_lock);
-
- closure_init_stack(cl);
-
- le64_add_cpu(&c->disk_sb->seq, 1);
-
- for_each_online_member(ca, c, i)
- bch_sb_from_fs(c, ca);
-
- if (c->opts.nochanges)
- goto out;
-
- do {
- wrote = false;
- for_each_online_member(ca, c, i)
- if (write_one_super(c, ca, super_idx))
- wrote = true;
-
- closure_sync(cl);
- super_idx++;
- } while (wrote);
-out:
- /* Make new options visible after they're persistent: */
- bch_sb_update(c);
-}
-
-void bch_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k,
- bool meta)
-{
- struct bch_member *mi;
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- unsigned nr_replicas = 0;
-
- mutex_lock(&c->sb_lock);
-
- /* recheck, might have raced */
- if (bch_check_super_marked(c, k, meta)) {
- mutex_unlock(&c->sb_lock);
- return;
- }
-
- mi = bch_sb_get_members(c->disk_sb)->members;
-
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- (meta
- ? SET_BCH_MEMBER_HAS_METADATA
- : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
- nr_replicas++;
- }
-
- nr_replicas = min_t(unsigned, nr_replicas,
- (meta
- ? BCH_SB_META_REPLICAS_HAVE
- : BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb));
- (meta
- ? SET_BCH_SB_META_REPLICAS_HAVE
- : SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas);
-
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
-}
diff --git a/libbcache/super-io.h b/libbcache/super-io.h
deleted file mode 100644
index 1a9bd309..00000000
--- a/libbcache/super-io.h
+++ /dev/null
@@ -1,159 +0,0 @@
-#ifndef _BCACHE_SUPER_IO_H
-#define _BCACHE_SUPER_IO_H
-
-#include "extents.h"
-#include "super_types.h"
-
-#include <asm/byteorder.h>
-
-struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
-struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *,
- enum bch_sb_field_type, unsigned);
-struct bch_sb_field *bch_fs_sb_field_resize(struct bch_fs *,
- enum bch_sb_field_type, unsigned);
-
-#define field_to_type(_f, _name) \
- container_of_or_null(_f, struct bch_sb_field_##_name, field)
-
-#define BCH_SB_FIELD_TYPE(_name) \
-static inline struct bch_sb_field_##_name * \
-bch_sb_get_##_name(struct bch_sb *sb) \
-{ \
- return field_to_type(bch_sb_field_get(sb, \
- BCH_SB_FIELD_##_name), _name); \
-} \
- \
-static inline struct bch_sb_field_##_name * \
-bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s) \
-{ \
- return field_to_type(bch_sb_field_resize(sb, \
- BCH_SB_FIELD_##_name, u64s), _name); \
-} \
- \
-static inline struct bch_sb_field_##_name * \
-bch_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \
-{ \
- return field_to_type(bch_fs_sb_field_resize(c, \
- BCH_SB_FIELD_##_name, u64s), _name); \
-}
-
-BCH_SB_FIELD_TYPE(journal);
-BCH_SB_FIELD_TYPE(members);
-BCH_SB_FIELD_TYPE(crypt);
-
-static inline bool bch_sb_test_feature(struct bch_sb *sb,
- enum bch_sb_features f)
-{
- unsigned w = f / 64;
- unsigned b = f % 64;
-
- return le64_to_cpu(sb->features[w]) & (1ULL << b);
-}
-
-static inline void bch_sb_set_feature(struct bch_sb *sb,
- enum bch_sb_features f)
-{
- if (!bch_sb_test_feature(sb, f)) {
- unsigned w = f / 64;
- unsigned b = f % 64;
-
- le64_add_cpu(&sb->features[w], 1ULL << b);
- }
-}
-
-static inline __le64 bch_sb_magic(struct bch_fs *c)
-{
- __le64 ret;
- memcpy(&ret, &c->sb.uuid, sizeof(ret));
- return ret;
-}
-
-static inline __u64 jset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch_sb_magic(c) ^ JSET_MAGIC);
-}
-
-static inline __u64 pset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch_sb_magic(c) ^ PSET_MAGIC);
-}
-
-static inline __u64 bset_magic(struct bch_fs *c)
-{
- return __le64_to_cpu(bch_sb_magic(c) ^ BSET_MAGIC);
-}
-
-static inline struct bch_member_cpu bch_mi_to_cpu(struct bch_member *mi)
-{
- return (struct bch_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .state = BCH_MEMBER_STATE(mi),
- .tier = BCH_MEMBER_TIER(mi),
- .has_metadata = BCH_MEMBER_HAS_METADATA(mi),
- .has_data = BCH_MEMBER_HAS_DATA(mi),
- .replacement = BCH_MEMBER_REPLACEMENT(mi),
- .discard = BCH_MEMBER_DISCARD(mi),
- .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
- };
-}
-
-int bch_sb_to_fs(struct bch_fs *, struct bch_sb *);
-int bch_sb_from_fs(struct bch_fs *, struct bch_dev *);
-
-void bch_free_super(struct bcache_superblock *);
-int bch_super_realloc(struct bcache_superblock *, unsigned);
-
-const char *bch_validate_journal_layout(struct bch_sb *,
- struct bch_member_cpu);
-const char *bch_validate_cache_super(struct bcache_superblock *);
-
-const char *bch_read_super(struct bcache_superblock *,
- struct bch_opts, const char *);
-void bch_write_super(struct bch_fs *);
-
-void bch_check_mark_super_slowpath(struct bch_fs *,
- const struct bkey_i *, bool);
-
-static inline bool bch_check_super_marked(struct bch_fs *c,
- const struct bkey_i *k, bool meta)
-{
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- unsigned nr_replicas = 0;
- bool ret = true;
-
- extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
-
- if (ptr->cached)
- continue;
-
- if (!(meta
- ? ca->mi.has_metadata
- : ca->mi.has_data)) {
- ret = false;
- break;
- }
-
- nr_replicas++;
- }
-
- if (nr_replicas <
- (meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have))
- ret = false;
-
- return ret;
-}
-
-static inline void bch_check_mark_super(struct bch_fs *c,
- const struct bkey_i *k, bool meta)
-{
- if (bch_check_super_marked(c, k, meta))
- return;
-
- bch_check_mark_super_slowpath(c, k, meta);
-}
-
-#endif /* _BCACHE_SUPER_IO_H */
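
The BCH_SB_FIELD_TYPE() macro above generates typed accessors for each optional superblock field. A minimal sketch of how they are typically used (example_journal_bucket_count() is a hypothetical helper; bch_nr_journal_buckets() comes from the journal code, as used in super-io.c):

static unsigned example_journal_bucket_count(struct bch_sb *sb)
{
	struct bch_sb_field_journal *journal = bch_sb_get_journal(sb);

	/* the field is optional - it may be absent entirely: */
	return journal ? bch_nr_journal_buckets(journal) : 0;
}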
diff --git a/libbcache/super.c b/libbcache/super.c
deleted file mode 100644
index f5f74936..00000000
--- a/libbcache/super.c
+++ /dev/null
@@ -1,2047 +0,0 @@
-/*
- * bcache setup/teardown code, and some metadata io - read a superblock and
- * figure out what to do with it.
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "blockdev.h"
-#include "alloc.h"
-#include "btree_cache.h"
-#include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_io.h"
-#include "chardev.h"
-#include "checksum.h"
-#include "clock.h"
-#include "compress.h"
-#include "debug.h"
-#include "error.h"
-#include "fs.h"
-#include "fs-gc.h"
-#include "inode.h"
-#include "io.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "migrate.h"
-#include "movinggc.h"
-#include "notify.h"
-#include "stats.h"
-#include "super.h"
-#include "super-io.h"
-#include "tier.h"
-#include "writeback.h"
-
-#include <linux/backing-dev.h>
-#include <linux/blkdev.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/genhd.h>
-#include <linux/idr.h>
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/random.h>
-#include <linux/reboot.h>
-#include <linux/sysfs.h>
-#include <crypto/hash.h>
-
-#include <trace/events/bcache.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
-
-static const uuid_le invalid_uuid = {
- .b = {
- 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
- 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
- }
-};
-
-static struct kset *bcache_kset;
-static LIST_HEAD(bch_fs_list);
-static DEFINE_MUTEX(bch_fs_list_lock);
-
-static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
-struct workqueue_struct *bcache_io_wq;
-struct crypto_shash *bch_sha256;
-
-static void bch_dev_free(struct bch_dev *);
-static int bch_dev_alloc(struct bch_fs *, unsigned);
-static int bch_dev_sysfs_online(struct bch_dev *);
-static void __bch_dev_read_only(struct bch_fs *, struct bch_dev *);
-
-struct bch_fs *bch_bdev_to_fs(struct block_device *bdev)
-{
- struct bch_fs *c;
- struct bch_dev *ca;
- unsigned i;
-
- mutex_lock(&bch_fs_list_lock);
- rcu_read_lock();
-
- list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(ca, c, i)
- if (ca->disk_sb.bdev == bdev) {
- closure_get(&c->cl);
- goto found;
- }
- c = NULL;
-found:
- rcu_read_unlock();
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
-}
-
-static struct bch_fs *__bch_uuid_to_fs(uuid_le uuid)
-{
- struct bch_fs *c;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- list_for_each_entry(c, &bch_fs_list, list)
- if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
- return c;
-
- return NULL;
-}
-
-struct bch_fs *bch_uuid_to_fs(uuid_le uuid)
-{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch_uuid_to_fs(uuid);
- if (c)
- closure_get(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
-}
-
-int bch_congested(struct bch_fs *c, int bdi_bits)
-{
- struct backing_dev_info *bdi;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- if (bdi_bits & (1 << WB_sync_congested)) {
- /* Reads - check all devices: */
- for_each_readable_member(ca, c, i) {
- bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- } else {
- /* Writes prefer fastest tier: */
- struct bch_tier *tier = READ_ONCE(c->fastest_tier);
- struct dev_group *grp = tier ? &tier->devs : &c->all_devs;
-
- rcu_read_lock();
- group_for_each_dev(ca, grp, i) {
- bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
- }
-
- return ret;
-}
-
-static int bch_congested_fn(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
-
- return bch_congested(c, bdi_bits);
-}
-
-/* Filesystem RO/RW: */
-
-/*
- * For startup/shutdown of RW stuff, the dependencies are:
- *
- * - foreground writes depend on copygc and tiering (to free up space)
- *
- * - copygc and tiering depend on mark and sweep gc (they actually probably
- * don't because they either reserve ahead of time or don't block if
- * allocations fail, but allocations can require mark and sweep gc to run
- * because of generation number wraparound)
- *
- * - all of the above depends on the allocator threads
- *
- * - allocator depends on the journal (when it rewrites prios and gens)
- */
-
-static void __bch_fs_read_only(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
-
- bch_tiering_stop(c);
-
- for_each_member_device(ca, c, i)
- bch_moving_gc_stop(ca);
-
- bch_gc_thread_stop(c);
-
- bch_btree_flush(c);
-
- for_each_member_device(ca, c, i)
- bch_dev_allocator_stop(ca);
-
- bch_fs_journal_stop(&c->journal);
-}
-
-static void bch_writes_disabled(struct percpu_ref *writes)
-{
- struct bch_fs *c = container_of(writes, struct bch_fs, writes);
-
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- wake_up(&bch_read_only_wait);
-}
-
-void bch_fs_read_only(struct bch_fs *c)
-{
- mutex_lock(&c->state_lock);
- if (c->state != BCH_FS_STARTING &&
- c->state != BCH_FS_RW)
- goto out;
-
- if (test_bit(BCH_FS_ERROR, &c->flags))
- goto out;
-
- trace_fs_read_only(c);
-
- /*
- * Block new foreground-end write operations from starting - any new
- * writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch_dev_allocator_stop()).
- */
- percpu_ref_kill(&c->writes);
-
- del_timer(&c->foreground_write_wakeup);
- cancel_delayed_work(&c->pd_controllers_update);
-
- c->foreground_write_pd.rate.rate = UINT_MAX;
- bch_wake_delayed_writes((unsigned long) c);
-
- /*
- * If we're not doing an emergency shutdown, we want to wait on
- * outstanding writes to complete so they don't see spurious errors due
- * to shutting down the allocator:
- *
-	 * If we are doing an emergency shutdown, outstanding writes may
-	 * hang until we shut down the allocator, so we don't want to wait
-	 * on outstanding writes before shutting everything down - but
-	 * we do need to wait on them before returning and signalling
-	 * that going RO is complete:
- */
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
- test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
-
- __bch_fs_read_only(c);
-
- wait_event(bch_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
-
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
-
- if (!bch_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb, true);
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
- c->state = BCH_FS_RO;
- bch_notify_fs_read_only(c);
- trace_fs_read_only_done(c);
-out:
- mutex_unlock(&c->state_lock);
-}
-
-static void bch_fs_read_only_work(struct work_struct *work)
-{
- struct bch_fs *c =
- container_of(work, struct bch_fs, read_only_work);
-
- bch_fs_read_only(c);
-}
-
-static void bch_fs_read_only_async(struct bch_fs *c)
-{
- queue_work(system_long_wq, &c->read_only_work);
-}
-
-bool bch_fs_emergency_read_only(struct bch_fs *c)
-{
- bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
-
- bch_fs_read_only_async(c);
- bch_journal_halt(&c->journal);
-
- wake_up(&bch_read_only_wait);
- return ret;
-}
-
-const char *bch_fs_read_write(struct bch_fs *c)
-{
- struct bch_dev *ca;
- const char *err = NULL;
- unsigned i;
-
- mutex_lock(&c->state_lock);
- if (c->state != BCH_FS_STARTING &&
- c->state != BCH_FS_RO)
- goto out;
-
- err = "error starting allocator thread";
- for_each_rw_member(ca, c, i)
- if (bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
-
- err = "error starting btree GC thread";
- if (bch_gc_thread_start(c))
- goto err;
-
- err = "error starting moving GC thread";
- for_each_rw_member(ca, c, i)
- if (bch_moving_gc_start(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
-
- err = "error starting tiering thread";
- if (bch_tiering_start(c))
- goto err;
-
- schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
-
- if (c->state != BCH_FS_STARTING)
- percpu_ref_reinit(&c->writes);
-
- c->state = BCH_FS_RW;
- err = NULL;
-out:
- mutex_unlock(&c->state_lock);
- return err;
-err:
- __bch_fs_read_only(c);
- goto out;
-}
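
bch_fs_read_write() follows the error convention used throughout this file: return a const char * that is NULL on success, and assign err a human-readable message immediately before each fallible step so that a single goto reports whichever step failed. A self-contained sketch of the convention (the step functions are hypothetical stand-ins, not real bcache calls):

#include <stdio.h>

struct fs;						/* opaque stand-in for struct bch_fs */

static int start_step_a(struct fs *c) { (void) c; return 0; }	/* hypothetical */
static int start_step_b(struct fs *c) { (void) c; return 0; }	/* hypothetical */
static void undo_partial_start(struct fs *c) { (void) c; }	/* hypothetical */

static const char *start_subsystems(struct fs *c)
{
	const char *err;

	err = "error starting step A";
	if (start_step_a(c))
		goto err;

	err = "error starting step B";
	if (start_step_b(c))
		goto err;

	return NULL;					/* NULL means success */
err:
	undo_partial_start(c);				/* mirrors __bch_fs_read_only() on failure */
	return err;
}

int main(void)
{
	const char *err = start_subsystems(NULL);

	printf("%s\n", err ? err : "started");
	return 0;
}
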
-
-/* Filesystem startup/shutdown: */
-
-static void bch_fs_free(struct bch_fs *c)
-{
- bch_fs_encryption_exit(c);
- bch_fs_btree_exit(c);
- bch_fs_journal_exit(&c->journal);
- bch_io_clock_exit(&c->io_clock[WRITE]);
- bch_io_clock_exit(&c->io_clock[READ]);
- bch_fs_compress_exit(c);
- bch_fs_blockdev_exit(c);
- bdi_destroy(&c->bdi);
- lg_lock_free(&c->usage_lock);
- free_percpu(c->usage_percpu);
- mempool_exit(&c->btree_bounce_pool);
- mempool_exit(&c->bio_bounce_pages);
- bioset_exit(&c->bio_write);
- bioset_exit(&c->bio_read_split);
- bioset_exit(&c->bio_read);
- bioset_exit(&c->btree_read_bio);
- mempool_exit(&c->btree_interior_update_pool);
- mempool_exit(&c->btree_reserve_pool);
- mempool_exit(&c->fill_iter);
- percpu_ref_exit(&c->writes);
-
- if (c->copygc_wq)
- destroy_workqueue(c->copygc_wq);
- if (c->wq)
- destroy_workqueue(c->wq);
-
- free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
- kfree(c);
- module_put(THIS_MODULE);
-}
-
-static void bch_fs_exit(struct bch_fs *c)
-{
- unsigned i;
-
- del_timer_sync(&c->foreground_write_wakeup);
- cancel_delayed_work_sync(&c->pd_controllers_update);
- cancel_work_sync(&c->read_only_work);
- cancel_work_sync(&c->bio_submit_work);
- cancel_work_sync(&c->read_retry_work);
-
- for (i = 0; i < c->sb.nr_devices; i++)
- if (c->devs[i])
- bch_dev_free(c->devs[i]);
-
- closure_debug_destroy(&c->cl);
- kobject_put(&c->kobj);
-}
-
-static void bch_fs_offline(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
-
- mutex_lock(&bch_fs_list_lock);
- list_del(&c->list);
- mutex_unlock(&bch_fs_list_lock);
-
- for_each_member_device(ca, c, i)
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcache");
-
- if (c->kobj.state_in_sysfs)
- kobject_del(&c->kobj);
-
- bch_fs_debug_exit(c);
- bch_fs_chardev_exit(c);
-
- bch_cache_accounting_destroy(&c->accounting);
-
- kobject_put(&c->time_stats);
- kobject_put(&c->opts_dir);
- kobject_put(&c->internal);
-
- __bch_fs_read_only(c);
-}
-
-/*
- * conceptually __bch_fs_stop4 - block devices are closed, now we can
- * finally free the filesystem
- */
-void bch_fs_release(struct kobject *kobj)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- bch_notify_fs_stopped(c);
- bch_fs_free(c);
-}
-
-/*
- * All activity on the filesystem should have stopped now - close devices:
- */
-static void __bch_fs_stop3(struct closure *cl)
-{
- struct bch_fs *c = container_of(cl, struct bch_fs, cl);
-
- bch_fs_exit(c);
-}
-
-/*
- * Openers (i.e. block devices) should have exited; shut down all userspace
- * interfaces and wait for &c->cl to hit 0:
- */
-static void __bch_fs_stop2(struct closure *cl)
-{
- struct bch_fs *c = container_of(cl, struct bch_fs, caching);
-
- bch_fs_offline(c);
-
- closure_return(cl);
-}
-
-/*
- * First phase of the shutdown process that's kicked off by bch_fs_stop_async();
- * we haven't waited for anything to stop yet; we're just punting to process
- * context to shut down block devices:
- */
-static void __bch_fs_stop1(struct closure *cl)
-{
- struct bch_fs *c = container_of(cl, struct bch_fs, caching);
-
- bch_blockdevs_stop(c);
-
- continue_at(cl, __bch_fs_stop2, system_wq);
-}
-
-void bch_fs_stop_async(struct bch_fs *c)
-{
- mutex_lock(&c->state_lock);
- if (c->state != BCH_FS_STOPPING) {
- c->state = BCH_FS_STOPPING;
- closure_queue(&c->caching);
- }
- mutex_unlock(&c->state_lock);
-}
-
-void bch_fs_stop(struct bch_fs *c)
-{
- mutex_lock(&c->state_lock);
- BUG_ON(c->state == BCH_FS_STOPPING);
- c->state = BCH_FS_STOPPING;
- mutex_unlock(&c->state_lock);
-
- bch_blockdevs_stop(c);
-
- closure_sync(&c->caching);
- closure_debug_destroy(&c->caching);
-
- bch_fs_offline(c);
-
- closure_put(&c->cl);
- closure_sync(&c->cl);
-
- bch_fs_exit(c);
-}
-
-/* Stop, detaching from backing devices: */
-void bch_fs_detach(struct bch_fs *c)
-{
- if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
- bch_fs_stop_async(c);
-}
-
-#define alloc_bucket_pages(gfp, ca) \
- ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
-
-static struct bch_fs *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
-{
- struct bch_sb_field_members *mi;
- struct bch_fs *c;
- unsigned i, iter_size, journal_entry_bytes;
-
- c = kzalloc(sizeof(struct bch_fs), GFP_KERNEL);
- if (!c)
- return NULL;
-
- __module_get(THIS_MODULE);
-
- c->minor = -1;
-
- mutex_init(&c->state_lock);
- mutex_init(&c->sb_lock);
- INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
- mutex_init(&c->btree_cache_lock);
- mutex_init(&c->bucket_lock);
- mutex_init(&c->btree_root_lock);
- INIT_WORK(&c->read_only_work, bch_fs_read_only_work);
-
- init_rwsem(&c->gc_lock);
-
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- spin_lock_init(&c->name##_time.lock);
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
-
- bch_fs_allocator_init(c);
- bch_fs_tiering_init(c);
-
- INIT_LIST_HEAD(&c->list);
- INIT_LIST_HEAD(&c->cached_devs);
- INIT_LIST_HEAD(&c->btree_cache);
- INIT_LIST_HEAD(&c->btree_cache_freeable);
- INIT_LIST_HEAD(&c->btree_cache_freed);
-
- INIT_LIST_HEAD(&c->btree_interior_update_list);
- mutex_init(&c->btree_reserve_cache_lock);
- mutex_init(&c->btree_interior_update_lock);
-
- mutex_init(&c->bio_bounce_pages_lock);
- INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
- spin_lock_init(&c->bio_submit_lock);
- bio_list_init(&c->read_retry_list);
- spin_lock_init(&c->read_retry_lock);
- INIT_WORK(&c->read_retry_work, bch_read_retry_work);
- mutex_init(&c->zlib_workspace_lock);
-
- seqcount_init(&c->gc_pos_lock);
-
- c->prio_clock[READ].hand = 1;
- c->prio_clock[READ].min_prio = 0;
- c->prio_clock[WRITE].hand = 1;
- c->prio_clock[WRITE].min_prio = 0;
-
- c->congested_read_threshold_us = 2000;
- c->congested_write_threshold_us = 20000;
- c->error_limit = 16 << IO_ERROR_SHIFT;
- init_waitqueue_head(&c->writeback_wait);
-
- c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
-
- c->copy_gc_enabled = 1;
- c->tiering_enabled = 1;
- c->tiering_percent = 10;
-
- c->foreground_target_percent = 20;
-
- c->journal.write_time = &c->journal_write_time;
- c->journal.delay_time = &c->journal_delay_time;
- c->journal.blocked_time = &c->journal_blocked_time;
- c->journal.flush_seq_time = &c->journal_flush_seq_time;
-
- mutex_init(&c->uevent_lock);
-
- mutex_lock(&c->sb_lock);
-
- if (bch_sb_to_fs(c, sb)) {
- mutex_unlock(&c->sb_lock);
- goto err;
- }
-
- mutex_unlock(&c->sb_lock);
-
- scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
-
- bch_opts_apply(&c->opts, bch_sb_opts(sb));
- bch_opts_apply(&c->opts, opts);
-
- c->opts.nochanges |= c->opts.noreplay;
- c->opts.read_only |= c->opts.nochanges;
-
- c->block_bits = ilog2(c->sb.block_size);
-
- if (bch_fs_init_fault("fs_alloc"))
- goto err;
-
- iter_size = (btree_blocks(c) + 1) * 2 *
- sizeof(struct btree_node_iter_set);
-
- journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
-
- if (!(c->wq = alloc_workqueue("bcache",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcache_copygc",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
- percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
- mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
- sizeof(struct btree_reserve)) ||
- mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_interior_update)) ||
- mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
- bioset_init(&c->btree_read_bio, 1, 0) ||
- bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
- mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->sb.btree_node_size,
- BCH_ENCODED_EXTENT_MAX) /
- PAGE_SECTORS, 0) ||
- !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
- lg_lock_init(&c->usage_lock) ||
- mempool_init_page_pool(&c->btree_bounce_pool, 1,
- ilog2(btree_pages(c))) ||
- bdi_setup_and_register(&c->bdi, "bcache") ||
- bch_fs_blockdev_init(c) ||
- bch_io_clock_init(&c->io_clock[READ]) ||
- bch_io_clock_init(&c->io_clock[WRITE]) ||
- bch_fs_journal_init(&c->journal, journal_entry_bytes) ||
- bch_fs_btree_init(c) ||
- bch_fs_encryption_init(c) ||
- bch_fs_compress_init(c) ||
- bch_check_set_has_compressed_data(c, c->opts.compression))
- goto err;
-
- c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- c->bdi.congested_fn = bch_congested_fn;
- c->bdi.congested_data = c;
-
- mi = bch_sb_get_members(c->disk_sb);
- for (i = 0; i < c->sb.nr_devices; i++)
- if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) &&
- bch_dev_alloc(c, i))
- goto err;
-
- /*
- * Now that all allocations have succeeded, init various refcounty
- * things that let us shutdown:
- */
- closure_init(&c->cl, NULL);
-
- c->kobj.kset = bcache_kset;
- kobject_init(&c->kobj, &bch_fs_ktype);
- kobject_init(&c->internal, &bch_fs_internal_ktype);
- kobject_init(&c->opts_dir, &bch_fs_opts_dir_ktype);
- kobject_init(&c->time_stats, &bch_fs_time_stats_ktype);
-
- bch_cache_accounting_init(&c->accounting, &c->cl);
-
- closure_init(&c->caching, &c->cl);
- set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
-
- closure_get(&c->cl);
- continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
- return c;
-err:
- bch_fs_free(c);
- return NULL;
-}
-
-static const char *__bch_fs_online(struct bch_fs *c)
-{
- struct bch_dev *ca;
- const char *err = NULL;
- unsigned i;
- int ret;
-
- lockdep_assert_held(&bch_fs_list_lock);
-
- if (!list_empty(&c->list))
- return NULL;
-
- if (__bch_uuid_to_fs(c->sb.uuid))
- return "filesystem UUID already open";
-
- ret = bch_fs_chardev_init(c);
- if (ret)
- return "error creating character device";
-
- bch_fs_debug_init(c);
-
- if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
- kobject_add(&c->internal, &c->kobj, "internal") ||
- kobject_add(&c->opts_dir, &c->kobj, "options") ||
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
- bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
- return "error creating sysfs objects";
-
- mutex_lock(&c->state_lock);
-
- err = "error creating sysfs objects";
- __for_each_member_device(ca, c, i)
- if (bch_dev_sysfs_online(ca))
- goto err;
-
- err = "can't bring up blockdev volumes";
- if (bch_blockdev_volumes_start(c))
- goto err;
-
- bch_attach_backing_devs(c);
-
- list_add(&c->list, &bch_fs_list);
- err = NULL;
-err:
- mutex_unlock(&c->state_lock);
- return err;
-}
-
-static const char *bch_fs_online(struct bch_fs *c)
-{
- const char *err;
-
- mutex_lock(&bch_fs_list_lock);
- err = __bch_fs_online(c);
- mutex_unlock(&bch_fs_list_lock);
-
- return err;
-}
-
-static const char *__bch_fs_start(struct bch_fs *c)
-{
- const char *err = "cannot allocate memory";
- struct bch_sb_field_members *mi;
- struct bch_dev *ca;
- unsigned i, id;
- time64_t now;
- LIST_HEAD(journal);
- struct jset *j;
- int ret = -EINVAL;
-
- BUG_ON(c->state != BCH_FS_STARTING);
-
- mutex_lock(&c->sb_lock);
- for_each_online_member(ca, c, i)
- bch_sb_from_fs(c, ca);
- mutex_unlock(&c->sb_lock);
-
- if (BCH_SB_INITIALIZED(c->disk_sb)) {
- ret = bch_journal_read(c, &journal);
- if (ret)
- goto err;
-
- j = &list_entry(journal.prev, struct journal_replay, list)->j;
-
- c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
- c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
-
- err = "error reading priorities";
- for_each_readable_member(ca, c, i) {
- ret = bch_prio_read(ca);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- unsigned level;
- struct bkey_i *k;
-
- err = "bad btree root";
- k = bch_journal_find_btree_root(c, j, id, &level);
- if (!k && id == BTREE_ID_EXTENTS)
- goto err;
- if (!k) {
- pr_debug("missing btree root: %d", id);
- continue;
- }
-
- err = "error reading btree root";
- if (bch_btree_root_read(c, id, k, level))
- goto err;
- }
-
- bch_verbose(c, "starting mark and sweep:");
-
- err = "error in recovery";
- if (bch_initial_gc(c, &journal))
- goto err;
-
- if (c->opts.noreplay)
- goto recovery_done;
-
- bch_verbose(c, "mark and sweep done");
-
- /*
- * bch_journal_start() can't happen sooner, or btree_gc_finish()
- * will give spurious errors about oldest_gen > bucket_gen -
- * this is a hack but oh well.
- */
- bch_journal_start(c);
-
- err = "error starting allocator thread";
- for_each_rw_member(ca, c, i)
- if (bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
-
- bch_verbose(c, "starting journal replay:");
-
- err = "journal replay failed";
- ret = bch_journal_replay(c, &journal);
- if (ret)
- goto err;
-
- bch_verbose(c, "journal replay done");
-
- if (c->opts.norecovery)
- goto recovery_done;
-
- bch_verbose(c, "starting fsck:");
- err = "error in fsck";
- ret = bch_fsck(c, !c->opts.nofsck);
- if (ret)
- goto err;
-
- bch_verbose(c, "fsck done");
- } else {
- struct bch_inode_unpacked inode;
- struct bkey_inode_buf packed_inode;
- struct closure cl;
-
- closure_init_stack(&cl);
-
- bch_notice(c, "initializing new filesystem");
-
- bch_initial_gc(c, NULL);
-
- err = "unable to allocate journal buckets";
- for_each_rw_member(ca, c, i)
- if (bch_dev_journal_alloc(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
-
- /*
- * journal_res_get() will crash if called before this has
- * set up the journal.pin FIFO and journal.cur pointer:
- */
- bch_journal_start(c);
- bch_journal_set_replay_done(&c->journal);
-
- err = "error starting allocator thread";
- for_each_rw_member(ca, c, i)
- if (bch_dev_allocator_start(ca)) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
-
- err = "cannot allocate new btree root";
- for (id = 0; id < BTREE_ID_NR; id++)
- if (bch_btree_root_alloc(c, id, &cl)) {
- closure_sync(&cl);
- goto err;
- }
-
- /* Wait for new btree roots to be written: */
- closure_sync(&cl);
-
- bch_inode_init(c, &inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
- inode.inum = BCACHE_ROOT_INO;
-
- bch_inode_pack(&packed_inode, &inode);
-
- err = "error creating root directory";
- if (bch_btree_insert(c, BTREE_ID_INODES,
- &packed_inode.inode.k_i,
- NULL, NULL, NULL, 0))
- goto err;
-
- err = "error writing first journal entry";
- if (bch_journal_meta(&c->journal))
- goto err;
- }
-recovery_done:
- err = "dynamic fault";
- if (bch_fs_init_fault("fs_start"))
- goto err;
-
- if (c->opts.read_only) {
- bch_fs_read_only(c);
- } else {
- err = bch_fs_read_write(c);
- if (err)
- goto err;
- }
-
- mutex_lock(&c->sb_lock);
- mi = bch_sb_get_members(c->disk_sb);
- now = ktime_get_seconds();
-
- for_each_member_device(ca, c, i)
- mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
-
- SET_BCH_SB_INITIALIZED(c->disk_sb, true);
- SET_BCH_SB_CLEAN(c->disk_sb, false);
- c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
-
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- err = NULL;
-out:
- bch_journal_entries_free(&journal);
- return err;
-err:
- switch (ret) {
- case BCH_FSCK_ERRORS_NOT_FIXED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("mount with -o fix_errors to repair");
- err = "fsck error";
- break;
- case BCH_FSCK_REPAIR_UNIMPLEMENTED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("repair unimplemented: inform the developers so that it can be added");
- err = "fsck error";
- break;
- case BCH_FSCK_REPAIR_IMPOSSIBLE:
- bch_err(c, "filesystem contains errors, but repair impossible");
- err = "fsck error";
- break;
- case BCH_FSCK_UNKNOWN_VERSION:
- err = "unknown metadata version";;
- break;
- case -ENOMEM:
- err = "cannot allocate memory";
- break;
- case -EIO:
- err = "IO error";
- break;
- }
-
- BUG_ON(!err);
- set_bit(BCH_FS_ERROR, &c->flags);
- goto out;
-}
-
-const char *bch_fs_start(struct bch_fs *c)
-{
- return __bch_fs_start(c) ?: bch_fs_online(c);
-}
-
-static const char *bch_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
-{
- struct bch_sb_field_members *sb_mi;
-
- sb_mi = bch_sb_get_members(sb);
- if (!sb_mi)
- return "Invalid superblock: member info area missing";
-
- if (le16_to_cpu(sb->block_size) != c->sb.block_size)
- return "mismatched block size";
-
- if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
- return "new cache bucket size is too small";
-
- return NULL;
-}
-
-static const char *bch_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
-{
- struct bch_sb *newest =
- le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
- struct bch_sb_field_members *mi = bch_sb_get_members(newest);
-
- if (uuid_le_cmp(fs->uuid, sb->uuid))
- return "device not a member of filesystem";
-
- if (sb->dev_idx >= newest->nr_devices)
- return "device has invalid dev_idx";
-
- if (bch_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le)))
- return "device has been removed";
-
- if (fs->block_size != sb->block_size)
- return "mismatched block size";
-
- return NULL;
-}
-
-/* Device startup/shutdown: */
-
-void bch_dev_release(struct kobject *kobj)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
-
- kfree(ca);
-}
-
-static void bch_dev_free(struct bch_dev *ca)
-{
- unsigned i;
-
- cancel_work_sync(&ca->io_error_work);
-
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcache");
-
- if (ca->kobj.state_in_sysfs)
- kobject_del(&ca->kobj);
-
- bch_free_super(&ca->disk_sb);
- bch_dev_journal_exit(ca);
-
- free_percpu(ca->sectors_written);
- bioset_exit(&ca->replica_set);
- free_percpu(ca->usage_percpu);
- free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
- kfree(ca->prio_buckets);
- kfree(ca->bio_prio);
- vfree(ca->buckets);
- vfree(ca->oldest_gens);
- free_heap(&ca->heap);
- free_fifo(&ca->free_inc);
-
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&ca->free[i]);
-
- percpu_ref_exit(&ca->io_ref);
- percpu_ref_exit(&ca->ref);
- kobject_put(&ca->kobj);
-}
-
-static void bch_dev_io_ref_release(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
- complete(&ca->offline_complete);
-}
-
-static void __bch_dev_offline(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- lockdep_assert_held(&c->state_lock);
-
- __bch_dev_read_only(ca->fs, ca);
-
- reinit_completion(&ca->offline_complete);
- percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->offline_complete);
-
- if (ca->kobj.state_in_sysfs) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
-
- sysfs_remove_link(block, "bcache");
- sysfs_remove_link(&ca->kobj, "block");
- }
-
- bch_free_super(&ca->disk_sb);
- bch_dev_journal_exit(ca);
-}
-
-static void bch_dev_ref_release(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
-
- complete(&ca->stop_complete);
-}
-
-static void bch_dev_stop(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
-
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-
- synchronize_rcu();
-
- reinit_completion(&ca->stop_complete);
- percpu_ref_kill(&ca->ref);
- wait_for_completion(&ca->stop_complete);
-}
-
-static int bch_dev_sysfs_online(struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- int ret;
-
- if (!c->kobj.state_in_sysfs)
- return 0;
-
- if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &ca->fs->kobj,
- "dev-%u", ca->dev_idx);
- if (ret)
- return ret;
- }
-
- if (ca->disk_sb.bdev) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
-
- ret = sysfs_create_link(block, &ca->kobj, "bcache");
- if (ret)
- return ret;
- ret = sysfs_create_link(&ca->kobj, block, "block");
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int bch_dev_alloc(struct bch_fs *c, unsigned dev_idx)
-{
- struct bch_member *member;
- size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
- size_t heap_size;
- unsigned i;
- struct bch_dev *ca;
-
- if (bch_fs_init_fault("dev_alloc"))
- return -ENOMEM;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- return -ENOMEM;
-
- kobject_init(&ca->kobj, &bch_dev_ktype);
- init_completion(&ca->stop_complete);
- init_completion(&ca->offline_complete);
-
- spin_lock_init(&ca->self.lock);
- ca->self.nr = 1;
- rcu_assign_pointer(ca->self.d[0].dev, ca);
- ca->dev_idx = dev_idx;
-
- spin_lock_init(&ca->freelist_lock);
- spin_lock_init(&ca->prio_buckets_lock);
- mutex_init(&ca->heap_lock);
- bch_dev_moving_gc_init(ca);
-
- INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
-
- if (bch_fs_init_fault("dev_alloc"))
- goto err;
-
- member = bch_sb_get_members(c->disk_sb)->members + dev_idx;
-
- ca->mi = bch_mi_to_cpu(member);
- ca->uuid = member->uuid;
- ca->bucket_bits = ilog2(ca->mi.bucket_size);
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
-
- /* XXX: tune these */
- movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
- reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
- /*
- * free_inc must be smaller than the copygc reserve: if it was bigger,
- * one copygc iteration might not make enough buckets available to fill
- * up free_inc and allow the allocator to make forward progress
- */
- free_inc_reserve = movinggc_reserve / 2;
- heap_size = movinggc_reserve * 8;
-
- if (percpu_ref_init(&ca->ref, bch_dev_ref_release,
- 0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch_dev_io_ref_release,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_MOVINGGC],
- movinggc_reserve, GFP_KERNEL) ||
- !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
- !init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
- !init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
- !(ca->oldest_gens = vzalloc(sizeof(u8) *
- ca->mi.nbuckets)) ||
- !(ca->buckets = vzalloc(sizeof(struct bucket) *
- ca->mi.nbuckets)) ||
- !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
- 2, GFP_KERNEL)) ||
- !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
- !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
- !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
- bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio)) ||
- !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
- goto err;
-
- ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
-
- total_reserve = ca->free_inc.size;
- for (i = 0; i < RESERVE_NR; i++)
- total_reserve += ca->free[i].size;
-
- ca->copygc_write_point.group = &ca->self;
- ca->tiering_write_point.group = &ca->self;
-
- ca->fs = c;
- rcu_assign_pointer(c->devs[ca->dev_idx], ca);
-
- if (bch_dev_sysfs_online(ca))
- pr_warn("error creating sysfs objects");
-
- return 0;
-err:
- bch_dev_free(ca);
- return -ENOMEM;
-}
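
To make the reserve sizing above concrete, here is the same arithmetic worked through for a hypothetical device with 2^20 buckets (only the device size is made up; the formulas are the ones from bch_dev_alloc()):

#include <stdio.h>
#include <stddef.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	size_t nbuckets		= (size_t) 1 << 20;			/* hypothetical device */
	size_t movinggc_reserve	= MAX((size_t) 16, nbuckets >> 7);	/* 8192 */
	size_t reserve_none	= MAX((size_t) 4, nbuckets >> 9);	/* 2048 */
	size_t free_inc_reserve	= movinggc_reserve / 2;			/* 4096 */
	size_t heap_size	= movinggc_reserve * 8;			/* 65536 */

	printf("movinggc %zu none %zu free_inc %zu heap %zu\n",
	       movinggc_reserve, reserve_none, free_inc_reserve, heap_size);
	return 0;
}
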
-
-static int __bch_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
-{
- struct bch_dev *ca;
- int ret;
-
- lockdep_assert_held(&c->sb_lock);
-
- if (le64_to_cpu(sb->sb->seq) >
- le64_to_cpu(c->disk_sb->seq))
- bch_sb_to_fs(c, sb->sb);
-
- BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
- !c->devs[sb->sb->dev_idx]);
-
- ca = c->devs[sb->sb->dev_idx];
- if (ca->disk_sb.bdev) {
- bch_err(c, "already have device online in slot %u",
- sb->sb->dev_idx);
- return -EINVAL;
- }
-
- ret = bch_dev_journal_init(ca, sb->sb);
- if (ret)
- return ret;
-
- /*
- * Increase journal write timeout if flushes to this device are
- * expensive:
- */
- if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
- journal_flushes_device(ca))
- c->journal.write_delay_ms =
- max(c->journal.write_delay_ms, 1000U);
-
- /* Commit: */
- ca->disk_sb = *sb;
- if (sb->mode & FMODE_EXCL)
- ca->disk_sb.bdev->bd_holder = ca;
- memset(sb, 0, sizeof(*sb));
-
- if (c->sb.nr_devices == 1)
- bdevname(ca->disk_sb.bdev, c->name);
- bdevname(ca->disk_sb.bdev, ca->name);
-
- if (bch_dev_sysfs_online(ca))
- pr_warn("error creating sysfs objects");
-
- lg_local_lock(&c->usage_lock);
- if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
- bch_mark_dev_metadata(ca->fs, ca);
- lg_local_unlock(&c->usage_lock);
-
- percpu_ref_reinit(&ca->io_ref);
- return 0;
-}
-
-/* Device management: */
-
-bool bch_fs_may_start(struct bch_fs *c, int flags)
-{
- struct bch_sb_field_members *mi;
- unsigned meta_missing = 0;
- unsigned data_missing = 0;
- bool degraded = false;
- unsigned i;
-
- mutex_lock(&c->sb_lock);
- mi = bch_sb_get_members(c->disk_sb);
-
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (!c->devs[i] &&
- !bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
- degraded = true;
- if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
- meta_missing++;
- if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
- data_missing++;
- }
- mutex_unlock(&c->sb_lock);
-
- if (degraded &&
- !(flags & BCH_FORCE_IF_DEGRADED))
- return false;
-
- if (meta_missing &&
- !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
- return false;
-
- if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
- !(flags & BCH_FORCE_IF_METADATA_LOST))
- return false;
-
- if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return false;
-
- if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
- !(flags & BCH_FORCE_IF_DATA_LOST))
- return false;
-
- return true;
-}
-
-bool bch_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- lockdep_assert_held(&c->state_lock);
-
- if (new_state == BCH_MEMBER_STATE_RW)
- return true;
-
- if (ca->mi.has_data &&
- !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return false;
-
- if (ca->mi.has_data &&
- c->sb.data_replicas_have <= 1 &&
- !(flags & BCH_FORCE_IF_DATA_LOST))
- return false;
-
- if (ca->mi.has_metadata &&
- !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
- return false;
-
- if (ca->mi.has_metadata &&
- c->sb.meta_replicas_have <= 1 &&
- !(flags & BCH_FORCE_IF_METADATA_LOST))
- return false;
-
- return true;
-}
-
-static void __bch_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
-{
- bch_moving_gc_stop(ca);
-
- /*
- * This stops new data writes (e.g. to existing open data
- * buckets) and then waits for all existing writes to
- * complete.
- */
- bch_dev_allocator_stop(ca);
-
- bch_dev_group_remove(&c->journal.devs, ca);
-}
-
-static const char *__bch_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
-{
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
-
- trace_bcache_cache_read_write(ca);
-
- if (bch_dev_allocator_start(ca))
- return "error starting allocator thread";
-
- if (bch_moving_gc_start(ca))
- return "error starting moving GC thread";
-
- if (bch_tiering_start(c))
- return "error starting tiering thread";
-
- bch_notify_dev_read_write(ca);
- trace_bcache_cache_read_write_done(ca);
-
- return NULL;
-}
-
-int __bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- struct bch_sb_field_members *mi;
-
- if (ca->mi.state == new_state)
- return 0;
-
- if (!bch_dev_state_allowed(c, ca, new_state, flags))
- return -EINVAL;
-
- if (new_state == BCH_MEMBER_STATE_RW) {
- if (__bch_dev_read_write(c, ca))
- return -ENOMEM;
- } else {
- __bch_dev_read_only(c, ca);
- }
-
- bch_notice(ca, "%s", bch_dev_state[new_state]);
-
- mutex_lock(&c->sb_lock);
- mi = bch_sb_get_members(c->disk_sb);
- SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
-
-int bch_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
-{
- int ret;
-
- mutex_lock(&c->state_lock);
- ret = __bch_dev_set_state(c, ca, new_state, flags);
- mutex_unlock(&c->state_lock);
-
- return ret;
-}
-
-/* Device add/removal: */
-
-int bch_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- struct bch_sb_field_members *mi;
- unsigned dev_idx = ca->dev_idx;
- int ret = -EINVAL;
-
- mutex_lock(&c->state_lock);
-
- percpu_ref_put(&ca->ref); /* XXX */
-
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- bch_err(ca, "Cannot remove RW device");
- goto err;
- }
-
- if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
- bch_err(ca, "Cannot remove without losing data");
- goto err;
- }
-
- /*
- * XXX: verify that dev_idx is really not in use anymore, anywhere
- *
- * flag_data_bad() does not check btree pointers
- */
- ret = bch_flag_data_bad(ca);
- if (ret) {
- bch_err(ca, "Remove failed");
- goto err;
- }
-
- if (ca->mi.has_data || ca->mi.has_metadata) {
- bch_err(ca, "Remove failed, still has data");
- goto err;
- }
-
- /*
- * Ok, really doing the remove:
- * Drop device's prio pointer before removing it from superblock:
- */
- spin_lock(&c->journal.lock);
- c->journal.prio_buckets[dev_idx] = 0;
- spin_unlock(&c->journal.lock);
-
- bch_journal_meta(&c->journal);
-
- __bch_dev_offline(ca);
- bch_dev_stop(ca);
- bch_dev_free(ca);
-
- /*
- * Free this device's slot in the bch_member array - all pointers to
- * this device must be gone:
- */
- mutex_lock(&c->sb_lock);
- mi = bch_sb_get_members(c->disk_sb);
- memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
-
- bch_write_super(c);
-
- mutex_unlock(&c->sb_lock);
- mutex_unlock(&c->state_lock);
- return 0;
-err:
- mutex_unlock(&c->state_lock);
- return ret;
-}
-
-int bch_dev_add(struct bch_fs *c, const char *path)
-{
- struct bcache_superblock sb;
- const char *err;
- struct bch_dev *ca = NULL;
- struct bch_sb_field_members *mi, *dev_mi;
- struct bch_member saved_mi;
- unsigned dev_idx, nr_devices, u64s;
- int ret = -EINVAL;
-
- err = bch_read_super(&sb, bch_opts_empty(), path);
- if (err)
- return -EINVAL;
-
- err = bch_validate_cache_super(&sb);
- if (err)
- return -EINVAL;
-
- err = bch_dev_may_add(sb.sb, c);
- if (err)
- return -EINVAL;
-
- mutex_lock(&c->state_lock);
- mutex_lock(&c->sb_lock);
-
- /*
- * Preserve the old cache member information (esp. tier)
- * before we start bashing the disk stuff.
- */
- dev_mi = bch_sb_get_members(sb.sb);
- saved_mi = dev_mi->members[sb.sb->dev_idx];
- saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
-
- if (dynamic_fault("bcache:add:no_slot"))
- goto no_slot;
-
- mi = bch_sb_get_members(c->disk_sb);
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (dev_idx >= c->sb.nr_devices ||
- bch_is_zero(mi->members[dev_idx].uuid.b,
- sizeof(uuid_le)))
- goto have_slot;
-no_slot:
- err = "no slots available in superblock";
- ret = -ENOSPC;
- goto err_unlock;
-
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
- u64s = (sizeof(struct bch_sb_field_members) +
- sizeof(struct bch_member) * nr_devices) / sizeof(u64);
- err = "no space in superblock for member info";
-
- mi = bch_fs_sb_resize_members(c, u64s);
- if (!mi)
- goto err_unlock;
-
- dev_mi = bch_sb_resize_members(&sb, u64s);
- if (!dev_mi)
- goto err_unlock;
-
- memcpy(dev_mi, mi, u64s * sizeof(u64));
- dev_mi->members[dev_idx] = saved_mi;
-
- sb.sb->uuid = c->disk_sb->uuid;
- sb.sb->dev_idx = dev_idx;
- sb.sb->nr_devices = nr_devices;
-
- /* commit new member info */
- memcpy(mi, dev_mi, u64s * sizeof(u64));
- c->disk_sb->nr_devices = nr_devices;
- c->sb.nr_devices = nr_devices;
-
- if (bch_dev_alloc(c, dev_idx)) {
- err = "cannot allocate memory";
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- if (__bch_dev_online(c, &sb)) {
- err = "bch_dev_online() error";
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- bch_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- ca = c->devs[dev_idx];
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- err = "journal alloc failed";
- if (bch_dev_journal_alloc(ca))
- goto err;
-
- err = __bch_dev_read_write(c, ca);
- if (err)
- goto err;
- }
-
- bch_notify_dev_added(ca);
- mutex_unlock(&c->state_lock);
- return 0;
-err_unlock:
- mutex_unlock(&c->sb_lock);
-err:
- mutex_unlock(&c->state_lock);
- bch_free_super(&sb);
-
- bch_err(c, "Unable to add device: %s", err);
- return ret ?: -EINVAL;
-}
-
-int bch_dev_online(struct bch_fs *c, const char *path)
-{
- struct bcache_superblock sb = { 0 };
- struct bch_dev *ca;
- unsigned dev_idx;
- const char *err;
-
- mutex_lock(&c->state_lock);
-
- err = bch_read_super(&sb, bch_opts_empty(), path);
- if (err)
- goto err;
-
- dev_idx = sb.sb->dev_idx;
-
- err = bch_dev_in_fs(c->disk_sb, sb.sb);
- if (err)
- goto err;
-
- mutex_lock(&c->sb_lock);
- if (__bch_dev_online(c, &sb)) {
- err = "__bch_dev_online() error";
- mutex_unlock(&c->sb_lock);
- goto err;
- }
- mutex_unlock(&c->sb_lock);
-
- ca = c->devs[dev_idx];
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- err = __bch_dev_read_write(c, ca);
- if (err)
- goto err;
- }
-
- mutex_unlock(&c->state_lock);
- return 0;
-err:
- mutex_unlock(&c->state_lock);
- bch_free_super(&sb);
- bch_err(c, "error bringing %s online: %s", path, err);
- return -EINVAL;
-}
-
-int bch_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- mutex_lock(&c->state_lock);
-
- if (!bch_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
- bch_err(ca, "Cannot offline required disk");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
- }
-
- __bch_dev_read_only(c, ca);
- __bch_dev_offline(ca);
-
- mutex_unlock(&c->state_lock);
- return 0;
-}
-
-int bch_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
-{
- int ret;
-
- mutex_lock(&c->state_lock);
-
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
- bch_err(ca, "Cannot migrate data off RW device");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
- }
-
- mutex_unlock(&c->state_lock);
-
- ret = bch_move_data_off_device(ca);
- if (ret) {
- bch_err(ca, "Error migrating data: %i", ret);
- return ret;
- }
-
- ret = bch_move_metadata_off_device(ca);
- if (ret) {
- bch_err(ca, "Error migrating metadata: %i", ret);
- return ret;
- }
-
- if (ca->mi.has_data || ca->mi.has_metadata) {
- bch_err(ca, "Migrate error: data still present");
- return -EINVAL;
- }
-
- return 0;
-}
-
-/* Filesystem open: */
-
-const char *bch_fs_open(char * const *devices, unsigned nr_devices,
- struct bch_opts opts, struct bch_fs **ret)
-{
- const char *err;
- struct bch_fs *c = NULL;
- struct bcache_superblock *sb;
- unsigned i, best_sb = 0;
-
- if (!nr_devices)
- return "need at least one device";
-
- if (!try_module_get(THIS_MODULE))
- return "module unloading";
-
- err = "cannot allocate memory";
- sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
- if (!sb)
- goto err;
-
- for (i = 0; i < nr_devices; i++) {
- err = bch_read_super(&sb[i], opts, devices[i]);
- if (err)
- goto err;
-
- err = "attempting to register backing device";
- if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
- goto err;
-
- err = bch_validate_cache_super(&sb[i]);
- if (err)
- goto err;
- }
-
- for (i = 1; i < nr_devices; i++)
- if (le64_to_cpu(sb[i].sb->seq) >
- le64_to_cpu(sb[best_sb].sb->seq))
- best_sb = i;
-
- for (i = 0; i < nr_devices; i++) {
- err = bch_dev_in_fs(sb[best_sb].sb, sb[i].sb);
- if (err)
- goto err;
- }
-
- err = "cannot allocate memory";
- c = bch_fs_alloc(sb[best_sb].sb, opts);
- if (!c)
- goto err;
-
- err = "bch_dev_online() error";
- mutex_lock(&c->sb_lock);
- for (i = 0; i < nr_devices; i++)
- if (__bch_dev_online(c, &sb[i])) {
- mutex_unlock(&c->sb_lock);
- goto err;
- }
- mutex_unlock(&c->sb_lock);
-
- err = "insufficient devices";
- if (!bch_fs_may_start(c, 0))
- goto err;
-
- if (!c->opts.nostart) {
- err = __bch_fs_start(c);
- if (err)
- goto err;
- }
-
- err = bch_fs_online(c);
- if (err)
- goto err;
-
- if (ret)
- *ret = c;
- else
- closure_put(&c->cl);
-
- err = NULL;
-out:
- kfree(sb);
- module_put(THIS_MODULE);
- if (err)
- c = NULL;
- return err;
-err:
- if (c)
- bch_fs_stop(c);
-
- for (i = 0; i < nr_devices; i++)
- bch_free_super(&sb[i]);
- goto out;
-}
-
-static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
- struct bch_opts opts)
-{
- const char *err;
- struct bch_fs *c;
- bool allocated_fs = false;
-
- err = bch_validate_cache_super(sb);
- if (err)
- return err;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch_uuid_to_fs(sb->sb->uuid);
- if (c) {
- closure_get(&c->cl);
-
- err = bch_dev_in_fs(c->disk_sb, sb->sb);
- if (err)
- goto err;
- } else {
- c = bch_fs_alloc(sb->sb, opts);
- err = "cannot allocate memory";
- if (!c)
- goto err;
-
- allocated_fs = true;
- }
-
- err = "bch_dev_online() error";
-
- mutex_lock(&c->sb_lock);
- if (__bch_dev_online(c, sb)) {
- mutex_unlock(&c->sb_lock);
- goto err;
- }
- mutex_unlock(&c->sb_lock);
-
- if (!c->opts.nostart && bch_fs_may_start(c, 0)) {
- err = __bch_fs_start(c);
- if (err)
- goto err;
- }
-
- err = __bch_fs_online(c);
- if (err)
- goto err;
-
- closure_put(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return NULL;
-err:
- mutex_unlock(&bch_fs_list_lock);
-
- if (allocated_fs)
- bch_fs_stop(c);
- else if (c)
- closure_put(&c->cl);
-
- return err;
-}
-
-const char *bch_fs_open_incremental(const char *path)
-{
- struct bcache_superblock sb;
- struct bch_opts opts = bch_opts_empty();
- const char *err;
-
- err = bch_read_super(&sb, opts, path);
- if (err)
- return err;
-
- if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
- mutex_lock(&bch_fs_list_lock);
- err = bch_backing_dev_register(&sb);
- mutex_unlock(&bch_fs_list_lock);
- } else {
- err = __bch_fs_open_incremental(&sb, opts);
- }
-
- bch_free_super(&sb);
-
- return err;
-}
-
-/* Global interfaces/init */
-
-#define kobj_attribute_write(n, fn) \
- static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
-
-#define kobj_attribute_rw(n, show, store) \
- static struct kobj_attribute ksysfs_##n = \
- __ATTR(n, S_IWUSR|S_IRUSR, show, store)
-
-static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
- const char *, size_t);
-
-kobj_attribute_write(register, register_bcache);
-kobj_attribute_write(register_quiet, register_bcache);
-
-static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
- const char *buffer, size_t size)
-{
- ssize_t ret = -EINVAL;
- const char *err = "cannot allocate memory";
- char *path = NULL;
-
- if (!try_module_get(THIS_MODULE))
- return -EBUSY;
-
- if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
- goto err;
-
- err = bch_fs_open_incremental(strim(path));
- if (err)
- goto err;
-
- ret = size;
-out:
- kfree(path);
- module_put(THIS_MODULE);
- return ret;
-err:
- pr_err("error opening %s: %s", path, err);
- goto out;
-}
-
-static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
-{
- if (code == SYS_DOWN ||
- code == SYS_HALT ||
- code == SYS_POWER_OFF) {
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
-
- if (!list_empty(&bch_fs_list))
- pr_info("Setting all devices read only:");
-
- list_for_each_entry(c, &bch_fs_list, list)
- bch_fs_read_only_async(c);
-
- list_for_each_entry(c, &bch_fs_list, list)
- bch_fs_read_only(c);
-
- mutex_unlock(&bch_fs_list_lock);
- }
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block reboot = {
- .notifier_call = bcache_reboot,
- .priority = INT_MAX, /* before any real devices */
-};
-
-static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
- const char *buffer, size_t size)
-{
- bcache_reboot(NULL, SYS_DOWN, NULL);
- return size;
-}
-
-kobj_attribute_write(reboot, reboot_test);
-
-static void bcache_exit(void)
-{
- bch_debug_exit();
- bch_vfs_exit();
- bch_blockdev_exit();
- bch_chardev_exit();
- if (bcache_kset)
- kset_unregister(bcache_kset);
- if (bcache_io_wq)
- destroy_workqueue(bcache_io_wq);
- if (!IS_ERR_OR_NULL(bch_sha256))
- crypto_free_shash(bch_sha256);
- unregister_reboot_notifier(&reboot);
-}
-
-static int __init bcache_init(void)
-{
- static const struct attribute *files[] = {
- &ksysfs_register.attr,
- &ksysfs_register_quiet.attr,
- &ksysfs_reboot.attr,
- NULL
- };
-
- register_reboot_notifier(&reboot);
- closure_debug_init();
- bkey_pack_test();
-
- bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(bch_sha256))
- goto err;
-
- if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
- !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
- sysfs_create_files(&bcache_kset->kobj, files) ||
- bch_chardev_init() ||
- bch_blockdev_init() ||
- bch_vfs_init() ||
- bch_debug_init())
- goto err;
-
- return 0;
-err:
- bcache_exit();
- return -ENOMEM;
-}
-
-#define BCH_DEBUG_PARAM(name, description) \
- bool bch_##name; \
- module_param_named(name, bch_##name, bool, 0644); \
- MODULE_PARM_DESC(name, description);
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-module_exit(bcache_exit);
-module_init(bcache_init);
diff --git a/libbcache/super.h b/libbcache/super.h
deleted file mode 100644
index 66c34308..00000000
--- a/libbcache/super.h
+++ /dev/null
@@ -1,136 +0,0 @@
-#ifndef _BCACHE_SUPER_H
-#define _BCACHE_SUPER_H
-
-#include "extents.h"
-
-#include <linux/bcache-ioctl.h>
-
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
- return s >> ca->bucket_bits;
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
- return ((sector_t) b) << ca->bucket_bits;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
- return s & (ca->mi.bucket_size - 1);
-}
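
A quick worked example of these helpers, which rely on the bucket size being a power of two (bucket_bits is ilog2 of the bucket size): assume a hypothetical bucket_size of 1024 sectors, so bucket_bits = 10.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned bucket_bits = 10;			/* ilog2(1024) */
	uint64_t bucket_size = UINT64_C(1) << bucket_bits;
	uint64_t s = 5000;				/* arbitrary sector */

	uint64_t bucket = s >> bucket_bits;		/* sector_to_bucket(): 4 */
	uint64_t offset = s & (bucket_size - 1);	/* bucket_remainder(): 904 */
	uint64_t start  = bucket << bucket_bits;	/* bucket_to_sector(): 4096 */

	printf("bucket %llu offset %llu start %llu\n",
	       (unsigned long long) bucket,
	       (unsigned long long) offset,
	       (unsigned long long) start);
	return 0;
}
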
-
-static inline struct bch_dev *__bch_next_dev(struct bch_fs *c, unsigned *iter)
-{
- struct bch_dev *ca = NULL;
-
- while (*iter < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[*iter],
- lockdep_is_held(&c->state_lock))))
- (*iter)++;
-
- return ca;
-}
-
-#define __for_each_member_device(ca, c, iter) \
- for ((iter) = 0; ((ca) = __bch_next_dev((c), &(iter))); (iter)++)
-
-#define for_each_member_device_rcu(ca, c, iter) \
- __for_each_member_device(ca, c, iter)
-
-static inline struct bch_dev *bch_get_next_dev(struct bch_fs *c, unsigned *iter)
-{
- struct bch_dev *ca;
-
- rcu_read_lock();
- if ((ca = __bch_next_dev(c, iter)))
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
- return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define for_each_member_device(ca, c, iter) \
- for ((iter) = 0; \
- (ca = bch_get_next_dev(c, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
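
A short usage sketch of the rule above (kernel context assumed; the helper is hypothetical): breaking out of for_each_member_device() skips the percpu_ref_put() in the loop's increment expression, so the break path must drop the ref itself.

/* Hypothetical helper illustrating the early-break rule: */
static bool fs_has_dev_idx(struct bch_fs *c, unsigned idx)
{
	struct bch_dev *ca;
	unsigned i;
	bool found = false;

	for_each_member_device(ca, c, i)
		if (ca->dev_idx == idx) {
			found = true;
			percpu_ref_put(&ca->ref);	/* required when breaking early */
			break;
		}

	return found;
}
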
-
-static inline struct bch_dev *bch_get_next_online_dev(struct bch_fs *c,
- unsigned *iter,
- int state_mask)
-{
- struct bch_dev *ca;
-
- rcu_read_lock();
- while ((ca = __bch_next_dev(c, iter)) &&
- (!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref)))
- (*iter)++;
- rcu_read_unlock();
-
- return ca;
-}
-
-#define __for_each_online_member(ca, c, iter, state_mask) \
- for ((iter) = 0; \
- (ca = bch_get_next_online_dev(c, &(iter), state_mask)); \
- percpu_ref_put(&ca->io_ref), (iter)++)
-
-#define for_each_online_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, ~0)
-
-#define for_each_rw_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW)
-
-#define for_each_readable_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, \
- (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
-
-struct bch_fs *bch_bdev_to_fs(struct block_device *);
-struct bch_fs *bch_uuid_to_fs(uuid_le);
-int bch_congested(struct bch_fs *, int);
-
-void bch_dev_release(struct kobject *);
-
-bool bch_dev_state_allowed(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int __bch_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-int bch_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-
-int bch_dev_fail(struct bch_dev *, int);
-int bch_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch_dev_add(struct bch_fs *, const char *);
-int bch_dev_online(struct bch_fs *, const char *);
-int bch_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch_dev_evacuate(struct bch_fs *, struct bch_dev *);
-
-void bch_fs_detach(struct bch_fs *);
-
-bool bch_fs_emergency_read_only(struct bch_fs *);
-void bch_fs_read_only(struct bch_fs *);
-const char *bch_fs_read_write(struct bch_fs *);
-
-void bch_fs_release(struct kobject *);
-void bch_fs_stop_async(struct bch_fs *);
-void bch_fs_stop(struct bch_fs *);
-
-const char *bch_fs_start(struct bch_fs *);
-const char *bch_fs_open(char * const *, unsigned, struct bch_opts,
- struct bch_fs **);
-const char *bch_fs_open_incremental(const char *path);
-
-extern struct workqueue_struct *bcache_io_wq;
-extern struct crypto_shash *bch_sha256;
-
-extern struct kobj_type bch_fs_ktype;
-extern struct kobj_type bch_fs_internal_ktype;
-extern struct kobj_type bch_fs_time_stats_ktype;
-extern struct kobj_type bch_fs_opts_dir_ktype;
-extern struct kobj_type bch_dev_ktype;
-
-#endif /* _BCACHE_SUPER_H */
diff --git a/libbcache/super_types.h b/libbcache/super_types.h
deleted file mode 100644
index 69c747de..00000000
--- a/libbcache/super_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _BCACHE_SUPER_TYPES_H
-#define _BCACHE_SUPER_TYPES_H
-
-struct bcache_superblock {
- struct bch_sb *sb;
- struct block_device *bdev;
- struct bio *bio;
- unsigned page_order;
- fmode_t mode;
-};
-
-#endif /* _BCACHE_SUPER_TYPES_H */
diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c
deleted file mode 100644
index 3536ec0c..00000000
--- a/libbcache/sysfs.c
+++ /dev/null
@@ -1,1336 +0,0 @@
-/*
- * bcache sysfs interfaces
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "alloc.h"
-#include "blockdev.h"
-#include "compress.h"
-#include "sysfs.h"
-#include "btree_cache.h"
-#include "btree_iter.h"
-#include "btree_update.h"
-#include "btree_gc.h"
-#include "buckets.h"
-#include "inode.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "opts.h"
-#include "request.h"
-#include "super-io.h"
-#include "tier.h"
-#include "writeback.h"
-
-#include <linux/blkdev.h>
-#include <linux/sort.h>
-
-write_attribute(attach);
-write_attribute(detach);
-write_attribute(unregister);
-write_attribute(stop);
-write_attribute(clear_stats);
-write_attribute(trigger_btree_coalesce);
-write_attribute(trigger_gc);
-write_attribute(prune_cache);
-write_attribute(blockdev_volume_create);
-
-read_attribute(uuid);
-read_attribute(minor);
-read_attribute(bucket_size);
-read_attribute(bucket_size_bytes);
-read_attribute(block_size);
-read_attribute(block_size_bytes);
-read_attribute(btree_node_size);
-read_attribute(btree_node_size_bytes);
-read_attribute(first_bucket);
-read_attribute(nbuckets);
-read_attribute(tree_depth);
-read_attribute(root_usage_percent);
-read_attribute(read_priority_stats);
-read_attribute(write_priority_stats);
-read_attribute(fragmentation_stats);
-read_attribute(oldest_gen_stats);
-read_attribute(reserve_stats);
-read_attribute(btree_cache_size);
-read_attribute(cache_available_percent);
-read_attribute(compression_stats);
-read_attribute(written);
-read_attribute(btree_written);
-read_attribute(metadata_written);
-read_attribute(journal_debug);
-write_attribute(journal_flush);
-read_attribute(internal_uuid);
-
-read_attribute(btree_gc_running);
-
-read_attribute(btree_nodes);
-read_attribute(btree_used_percent);
-read_attribute(average_key_size);
-read_attribute(available_buckets);
-read_attribute(free_buckets);
-read_attribute(dirty_data);
-read_attribute(dirty_bytes);
-read_attribute(dirty_buckets);
-read_attribute(cached_data);
-read_attribute(cached_bytes);
-read_attribute(cached_buckets);
-read_attribute(meta_buckets);
-read_attribute(alloc_buckets);
-read_attribute(has_data);
-read_attribute(has_metadata);
-read_attribute(bset_tree_stats);
-read_attribute(alloc_debug);
-
-read_attribute(state);
-read_attribute(cache_read_races);
-read_attribute(writeback_keys_done);
-read_attribute(writeback_keys_failed);
-read_attribute(io_errors);
-rw_attribute(io_error_limit);
-rw_attribute(io_error_halflife);
-read_attribute(congested);
-rw_attribute(congested_read_threshold_us);
-rw_attribute(congested_write_threshold_us);
-
-rw_attribute(sequential_cutoff);
-rw_attribute(cache_mode);
-rw_attribute(writeback_metadata);
-rw_attribute(writeback_running);
-rw_attribute(writeback_percent);
-sysfs_pd_controller_attribute(writeback);
-
-read_attribute(stripe_size);
-read_attribute(partial_stripes_expensive);
-
-rw_attribute(journal_write_delay_ms);
-rw_attribute(journal_reclaim_delay_ms);
-read_attribute(journal_entry_size_max);
-
-rw_attribute(discard);
-rw_attribute(running);
-rw_attribute(label);
-rw_attribute(readahead);
-rw_attribute(verify);
-rw_attribute(bypass_torture_test);
-rw_attribute(cache_replacement_policy);
-
-rw_attribute(foreground_write_ratelimit_enabled);
-rw_attribute(copy_gc_enabled);
-sysfs_pd_controller_attribute(copy_gc);
-
-rw_attribute(tier);
-rw_attribute(tiering_enabled);
-rw_attribute(tiering_percent);
-sysfs_pd_controller_attribute(tiering);
-
-sysfs_pd_controller_attribute(foreground_write);
-
-rw_attribute(pd_controllers_update_seconds);
-
-rw_attribute(foreground_target_percent);
-
-rw_attribute(size);
-read_attribute(meta_replicas_have);
-read_attribute(data_replicas_have);
-
-#define BCH_DEBUG_PARAM(name, description) \
- rw_attribute(name);
-
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_OPT(_name, _mode, ...) \
- static struct attribute sysfs_opt_##_name = { \
- .name = #_name, .mode = _mode, \
- };
-
- BCH_VISIBLE_OPTS()
-#undef BCH_OPT
-
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_time_stats_attribute(name, frequency_units, duration_units);
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
-
-static struct attribute sysfs_state_rw = {
- .name = "state",
- .mode = S_IRUGO
-};
-
-SHOW(bch_cached_dev)
-{
- struct cached_dev *dc = container_of(kobj, struct cached_dev,
- disk.kobj);
- const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
-
-#define var(stat) (dc->stat)
-
- if (attr == &sysfs_cache_mode)
- return bch_snprint_string_list(buf, PAGE_SIZE,
- bch_cache_modes + 1,
- BDEV_CACHE_MODE(dc->disk_sb.sb));
-
- var_printf(verify, "%i");
- var_printf(bypass_torture_test, "%i");
- var_printf(writeback_metadata, "%i");
- var_printf(writeback_running, "%i");
- var_print(writeback_percent);
- sysfs_pd_controller_show(writeback, &dc->writeback_pd);
-
- sysfs_hprint(dirty_data,
- bcache_dev_sectors_dirty(&dc->disk) << 9);
- sysfs_print(dirty_bytes,
- bcache_dev_sectors_dirty(&dc->disk) << 9);
-
- sysfs_hprint(stripe_size, dc->disk.stripe_size << 9);
- var_printf(partial_stripes_expensive, "%u");
-
- var_hprint(sequential_cutoff);
- var_hprint(readahead);
-
- sysfs_print(running, atomic_read(&dc->running));
- sysfs_print(state, states[BDEV_STATE(dc->disk_sb.sb)]);
-
- if (attr == &sysfs_label) {
- memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
- buf[BCH_SB_LABEL_SIZE] = '\0';
- strcat(buf, "\n");
- return strlen(buf);
- }
-
-#undef var
- return 0;
-}
-
-STORE(bch_cached_dev)
-{
- struct cached_dev *dc = container_of(kobj, struct cached_dev,
- disk.kobj);
- struct kobj_uevent_env *env;
-
-#define d_strtoul(var) sysfs_strtoul(var, dc->var)
-#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
-#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
-
- d_strtoul(verify);
- d_strtoul(bypass_torture_test);
- d_strtoul(writeback_metadata);
- d_strtoul(writeback_running);
- sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
- sysfs_pd_controller_store(writeback, &dc->writeback_pd);
-
- d_strtoi_h(sequential_cutoff);
- d_strtoi_h(readahead);
-
- if (attr == &sysfs_writeback_running)
- bch_writeback_queue(dc);
-
- if (attr == &sysfs_writeback_percent)
- schedule_delayed_work(&dc->writeback_pd_update,
- dc->writeback_pd_update_seconds * HZ);
-
- if (attr == &sysfs_clear_stats)
- bch_cache_accounting_clear(&dc->accounting);
-
- if (attr == &sysfs_running &&
- strtoul_or_return(buf))
- bch_cached_dev_run(dc);
-
- if (attr == &sysfs_cache_mode) {
- ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
-
- if (v < 0)
- return v;
-
- if ((unsigned) v != BDEV_CACHE_MODE(dc->disk_sb.sb)) {
- SET_BDEV_CACHE_MODE(dc->disk_sb.sb, v);
- bch_write_bdev_super(dc, NULL);
- }
- }
-
- if (attr == &sysfs_label) {
- u64 journal_seq = 0;
- int ret = 0;
-
- if (size > BCH_SB_LABEL_SIZE)
- return -EINVAL;
-
- mutex_lock(&dc->disk.inode_lock);
-
- memcpy(dc->disk_sb.sb->label, buf, size);
- if (size < BCH_SB_LABEL_SIZE)
- dc->disk_sb.sb->label[size] = '\0';
- if (size && dc->disk_sb.sb->label[size - 1] == '\n')
- dc->disk_sb.sb->label[size - 1] = '\0';
-
- memcpy(dc->disk.inode.v.i_label,
- dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
-
- bch_write_bdev_super(dc, NULL);
-
- if (dc->disk.c)
- ret = bch_btree_update(dc->disk.c, BTREE_ID_INODES,
- &dc->disk.inode.k_i,
- &journal_seq);
-
- mutex_unlock(&dc->disk.inode_lock);
-
- if (ret)
- return ret;
-
- if (dc->disk.c)
- ret = bch_journal_flush_seq(&dc->disk.c->journal,
- journal_seq);
- if (ret)
- return ret;
-
- env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
- if (!env)
- return -ENOMEM;
- add_uevent_var(env, "DRIVER=bcache");
- add_uevent_var(env, "CACHED_UUID=%pU", dc->disk_sb.sb->disk_uuid.b),
- add_uevent_var(env, "CACHED_LABEL=%s", buf);
- kobject_uevent_env(
- &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
- kfree(env);
- }
-
- if (attr == &sysfs_attach) {
- struct bch_fs *c;
- uuid_le uuid;
- int ret;
-
- if (uuid_parse(buf, &uuid))
- return -EINVAL;
-
- c = bch_uuid_to_fs(uuid);
- if (!c) {
- pr_err("Can't attach %s: cache set not found", buf);
- return -ENOENT;
- }
-
- dc->disk_sb.sb->set_uuid = uuid;
-
- ret = bch_cached_dev_attach(dc, c);
- closure_put(&c->cl);
- if (ret)
- return ret;
- }
-
- if (attr == &sysfs_detach && dc->disk.c)
- bch_cached_dev_detach(dc);
-
- if (attr == &sysfs_stop)
- bch_blockdev_stop(&dc->disk);
-
- return size;
-}
-
-static struct attribute *bch_cached_dev_files[] = {
- &sysfs_attach,
- &sysfs_detach,
- &sysfs_stop,
- &sysfs_cache_mode,
- &sysfs_writeback_metadata,
- &sysfs_writeback_running,
- &sysfs_writeback_percent,
- sysfs_pd_controller_files(writeback),
- &sysfs_dirty_data,
- &sysfs_dirty_bytes,
- &sysfs_stripe_size,
- &sysfs_partial_stripes_expensive,
- &sysfs_sequential_cutoff,
- &sysfs_clear_stats,
- &sysfs_running,
- &sysfs_state,
- &sysfs_label,
- &sysfs_readahead,
-#ifdef CONFIG_BCACHE_DEBUG
- &sysfs_verify,
- &sysfs_bypass_torture_test,
-#endif
- NULL
-};
-KTYPE(bch_cached_dev);
-
-SHOW(bch_blockdev_volume)
-{
- struct bcache_device *d = container_of(kobj, struct bcache_device,
- kobj);
-
- sysfs_hprint(size, le64_to_cpu(d->inode.v.i_size));
-
- if (attr == &sysfs_label) {
- memcpy(buf, d->inode.v.i_label, BCH_SB_LABEL_SIZE);
- buf[BCH_SB_LABEL_SIZE] = '\0';
- strcat(buf, "\n");
- return strlen(buf);
- }
-
- return 0;
-}
-
-STORE(bch_blockdev_volume)
-{
- struct bcache_device *d = container_of(kobj, struct bcache_device,
- kobj);
-
- if (attr == &sysfs_size) {
- u64 journal_seq = 0;
- u64 v = strtoi_h_or_return(buf);
- int ret;
-
- mutex_lock(&d->inode_lock);
-
- if (v < le64_to_cpu(d->inode.v.i_size)) {
- ret = bch_inode_truncate(d->c, d->inode.k.p.inode,
- v >> 9, NULL, NULL);
- if (ret) {
- mutex_unlock(&d->inode_lock);
- return ret;
- }
- }
- d->inode.v.i_size = cpu_to_le64(v);
- ret = bch_btree_update(d->c, BTREE_ID_INODES,
- &d->inode.k_i, &journal_seq);
-
- mutex_unlock(&d->inode_lock);
-
- if (ret)
- return ret;
-
- ret = bch_journal_flush_seq(&d->c->journal, journal_seq);
- if (ret)
- return ret;
-
- set_capacity(d->disk, v >> 9);
- }
-
- if (attr == &sysfs_label) {
- u64 journal_seq = 0;
- int ret;
-
- mutex_lock(&d->inode_lock);
-
- memcpy(d->inode.v.i_label, buf, BCH_SB_LABEL_SIZE);
- ret = bch_btree_update(d->c, BTREE_ID_INODES,
- &d->inode.k_i, &journal_seq);
-
- mutex_unlock(&d->inode_lock);
-
- return ret ?: bch_journal_flush_seq(&d->c->journal, journal_seq);
- }
-
- if (attr == &sysfs_unregister) {
- set_bit(BCACHE_DEV_DETACHING, &d->flags);
- bch_blockdev_stop(d);
- }
-
- return size;
-}
-
-static struct attribute *bch_blockdev_volume_files[] = {
- &sysfs_unregister,
- &sysfs_label,
- &sysfs_size,
- NULL
-};
-KTYPE(bch_blockdev_volume);
-
-static int bch_bset_print_stats(struct bch_fs *c, char *buf)
-{
- struct bset_stats stats;
- size_t nodes = 0;
- struct btree *b;
- struct bucket_table *tbl;
- struct rhash_head *pos;
- unsigned iter;
-
- memset(&stats, 0, sizeof(stats));
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, iter, pos) {
- bch_btree_keys_stats(b, &stats);
- nodes++;
- }
- rcu_read_unlock();
-
- return snprintf(buf, PAGE_SIZE,
- "btree nodes: %zu\n"
- "written sets: %zu\n"
- "written key bytes: %zu\n"
- "unwritten sets: %zu\n"
- "unwritten key bytes: %zu\n"
- "no table sets: %zu\n"
- "no table key bytes: %zu\n"
- "floats: %zu\n"
- "failed unpacked: %zu\n"
- "failed prev: %zu\n"
- "failed overflow: %zu\n",
- nodes,
- stats.sets[BSET_RO_AUX_TREE].nr,
- stats.sets[BSET_RO_AUX_TREE].bytes,
- stats.sets[BSET_RW_AUX_TREE].nr,
- stats.sets[BSET_RW_AUX_TREE].bytes,
- stats.sets[BSET_NO_AUX_TREE].nr,
- stats.sets[BSET_NO_AUX_TREE].bytes,
- stats.floats,
- stats.failed_unpacked,
- stats.failed_prev,
- stats.failed_overflow);
-}
-
-static unsigned bch_root_usage(struct bch_fs *c)
-{
- unsigned bytes = 0;
- struct bkey_packed *k;
- struct btree *b;
- struct btree_node_iter iter;
-
- goto lock_root;
-
- do {
- six_unlock_read(&b->lock);
-lock_root:
- b = c->btree_roots[BTREE_ID_EXTENTS].b;
- six_lock_read(&b->lock);
- } while (b != c->btree_roots[BTREE_ID_EXTENTS].b);
-
- for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b))
- bytes += bkey_bytes(k);
-
- six_unlock_read(&b->lock);
-
- return (bytes * 100) / btree_bytes(c);
-}
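(In other words: the root node can be split or replaced while this thread waits for the read lock, so after taking the lock the code re-checks that the locked node is still the current root and retries if not; the initial goto simply skips the unlock on the first pass.)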
-
-static size_t bch_btree_cache_size(struct bch_fs *c)
-{
- size_t ret = 0;
- struct btree *b;
-
- mutex_lock(&c->btree_cache_lock);
- list_for_each_entry(b, &c->btree_cache, list)
- ret += btree_bytes(c);
-
- mutex_unlock(&c->btree_cache_lock);
- return ret;
-}
-
-static unsigned bch_fs_available_percent(struct bch_fs *c)
-{
- return div64_u64((u64) sectors_available(c) * 100,
- c->capacity ?: 1);
-}
-
-#if 0
-static unsigned bch_btree_used(struct bch_fs *c)
-{
- return div64_u64(c->gc_stats.key_bytes * 100,
- (c->gc_stats.nodes ?: 1) * btree_bytes(c));
-}
-
-static unsigned bch_average_key_size(struct bch_fs *c)
-{
- return c->gc_stats.nkeys
- ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
- : 0;
-}
-#endif
-
-static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
-{
- struct bch_fs_usage stats = bch_fs_usage_read(c);
-
- return scnprintf(buf, PAGE_SIZE,
- "capacity:\t\t%llu\n"
- "compressed:\n"
- "\tmeta:\t\t%llu\n"
- "\tdirty:\t\t%llu\n"
- "\tcached:\t\t%llu\n"
- "uncompressed:\n"
- "\tmeta:\t\t%llu\n"
- "\tdirty:\t\t%llu\n"
- "\tcached:\t\t%llu\n"
- "persistent reserved sectors:\t%llu\n"
- "online reserved sectors:\t%llu\n",
- c->capacity,
- stats.s[S_COMPRESSED][S_META],
- stats.s[S_COMPRESSED][S_DIRTY],
- stats.s[S_COMPRESSED][S_CACHED],
- stats.s[S_UNCOMPRESSED][S_META],
- stats.s[S_UNCOMPRESSED][S_DIRTY],
- stats.s[S_UNCOMPRESSED][S_CACHED],
- stats.persistent_reserved,
- stats.online_reserved);
-}
-
-static ssize_t bch_compression_stats(struct bch_fs *c, char *buf)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
- nr_compressed_extents = 0,
- compressed_sectors_compressed = 0,
- compressed_sectors_uncompressed = 0;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k)
- if (k.k->type == BCH_EXTENT) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- const union bch_extent_crc *crc;
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) {
- nr_uncompressed_extents++;
- uncompressed_sectors += e.k->size;
- } else {
- nr_compressed_extents++;
- compressed_sectors_compressed +=
- crc_compressed_size(e.k, crc);
- compressed_sectors_uncompressed +=
- crc_uncompressed_size(e.k, crc);
- }
-
- /* only looking at the first ptr */
- break;
- }
- }
- bch_btree_iter_unlock(&iter);
-
- return snprintf(buf, PAGE_SIZE,
- "uncompressed data:\n"
- " nr extents: %llu\n"
- " size (bytes): %llu\n"
- "compressed data:\n"
- " nr extents: %llu\n"
- " compressed size (bytes): %llu\n"
- " uncompressed size (bytes): %llu\n",
- nr_uncompressed_extents,
- uncompressed_sectors << 9,
- nr_compressed_extents,
- compressed_sectors_compressed << 9,
- compressed_sectors_uncompressed << 9);
-}
-
-SHOW(bch_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- sysfs_print(minor, c->minor);
-
- sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
- sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
- sysfs_hprint(journal_entry_size_max, c->journal.entry_size_max);
-
- sysfs_hprint(block_size, block_bytes(c));
- sysfs_print(block_size_bytes, block_bytes(c));
- sysfs_hprint(btree_node_size, c->sb.btree_node_size << 9);
- sysfs_print(btree_node_size_bytes, c->sb.btree_node_size << 9);
-
- sysfs_hprint(btree_cache_size, bch_btree_cache_size(c));
- sysfs_print(cache_available_percent, bch_fs_available_percent(c));
-
- sysfs_print(btree_gc_running, c->gc_pos.phase != GC_PHASE_DONE);
-
-#if 0
- /* XXX: reimplement */
- sysfs_print(btree_used_percent, bch_btree_used(c));
- sysfs_print(btree_nodes, c->gc_stats.nodes);
- sysfs_hprint(average_key_size, bch_average_key_size(c));
-#endif
-
- sysfs_print(cache_read_races,
- atomic_long_read(&c->cache_read_races));
-
- sysfs_print(writeback_keys_done,
- atomic_long_read(&c->writeback_keys_done));
- sysfs_print(writeback_keys_failed,
- atomic_long_read(&c->writeback_keys_failed));
-
- /* See count_io_errors for why 88 */
- sysfs_print(io_error_halflife, c->error_decay * 88);
- sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
-
- sysfs_hprint(congested,
- ((uint64_t) bch_get_congested(c)) << 9);
- sysfs_print(congested_read_threshold_us,
- c->congested_read_threshold_us);
- sysfs_print(congested_write_threshold_us,
- c->congested_write_threshold_us);
-
- sysfs_printf(foreground_write_ratelimit_enabled, "%i",
- c->foreground_write_ratelimit_enabled);
- sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
- sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd);
-
- sysfs_print(pd_controllers_update_seconds,
- c->pd_controllers_update_seconds);
- sysfs_print(foreground_target_percent, c->foreground_target_percent);
-
- sysfs_printf(tiering_enabled, "%i", c->tiering_enabled);
- sysfs_print(tiering_percent, c->tiering_percent);
-
- sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
-
- sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
- sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
-
- /* Debugging: */
-
- if (attr == &sysfs_journal_debug)
- return bch_journal_print_debug(&c->journal, buf);
-
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
- if (!bch_fs_running(c))
- return -EPERM;
-
- if (attr == &sysfs_bset_tree_stats)
- return bch_bset_print_stats(c, buf);
- if (attr == &sysfs_alloc_debug)
- return show_fs_alloc_debug(c, buf);
-
- sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level);
- sysfs_print(root_usage_percent, bch_root_usage(c));
-
- if (attr == &sysfs_compression_stats)
- return bch_compression_stats(c, buf);
-
- sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
-
- return 0;
-}
-
-STORE(__bch_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- if (attr == &sysfs_unregister) {
- bch_fs_detach(c);
- return size;
- }
-
- if (attr == &sysfs_stop) {
- bch_fs_stop_async(c);
- return size;
- }
-
- if (attr == &sysfs_clear_stats) {
- atomic_long_set(&c->writeback_keys_done, 0);
- atomic_long_set(&c->writeback_keys_failed, 0);
- bch_cache_accounting_clear(&c->accounting);
-
- return size;
- }
-
- sysfs_strtoul(congested_read_threshold_us,
- c->congested_read_threshold_us);
- sysfs_strtoul(congested_write_threshold_us,
- c->congested_write_threshold_us);
-
- if (attr == &sysfs_io_error_limit) {
- c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
- return size;
- }
-
- /* See count_io_errors() for why 88 */
- if (attr == &sysfs_io_error_halflife) {
- c->error_decay = strtoul_or_return(buf) / 88;
- return size;
- }
-
- sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
- sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
-
- sysfs_strtoul(foreground_write_ratelimit_enabled,
- c->foreground_write_ratelimit_enabled);
-
- if (attr == &sysfs_copy_gc_enabled) {
- struct bch_dev *ca;
- unsigned i;
- ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
- ?: (ssize_t) size;
-
- for_each_member_device(ca, c, i)
- if (ca->moving_gc_read)
- wake_up_process(ca->moving_gc_read);
- return ret;
- }
-
- if (attr == &sysfs_tiering_enabled) {
- ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
- ?: (ssize_t) size;
-
- bch_tiering_start(c); /* issue wakeups */
- return ret;
- }
-
- sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
-
- sysfs_strtoul(pd_controllers_update_seconds,
- c->pd_controllers_update_seconds);
- sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
-
- sysfs_strtoul(tiering_percent, c->tiering_percent);
- sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */
-
- /* Debugging: */
-
-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
- if (!bch_fs_running(c))
- return -EPERM;
-
- if (attr == &sysfs_journal_flush) {
- bch_journal_meta_async(&c->journal, NULL);
-
- return size;
- }
-
- if (attr == &sysfs_blockdev_volume_create) {
- u64 v = strtoi_h_or_return(buf);
- int r = bch_blockdev_volume_create(c, v);
-
- if (r)
- return r;
- }
-
- if (attr == &sysfs_trigger_btree_coalesce)
- bch_coalesce(c);
-
- /* Debugging: */
-
- if (attr == &sysfs_trigger_gc)
- bch_gc(c);
-
- if (attr == &sysfs_prune_cache) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache_shrink.scan_objects(&c->btree_cache_shrink, &sc);
- }
-
- return size;
-}
-
-STORE(bch_fs)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
-
- mutex_lock(&c->state_lock);
- size = __bch_fs_store(kobj, attr, buf, size);
- mutex_unlock(&c->state_lock);
-
- return size;
-}
-
-static struct attribute *bch_fs_files[] = {
- &sysfs_unregister,
- &sysfs_stop,
- &sysfs_journal_write_delay_ms,
- &sysfs_journal_reclaim_delay_ms,
- &sysfs_journal_entry_size_max,
- &sysfs_blockdev_volume_create,
-
- &sysfs_block_size,
- &sysfs_block_size_bytes,
- &sysfs_btree_node_size,
- &sysfs_btree_node_size_bytes,
- &sysfs_tree_depth,
- &sysfs_root_usage_percent,
- &sysfs_btree_cache_size,
- &sysfs_cache_available_percent,
- &sysfs_compression_stats,
-
- &sysfs_average_key_size,
-
- &sysfs_io_error_limit,
- &sysfs_io_error_halflife,
- &sysfs_congested,
- &sysfs_congested_read_threshold_us,
- &sysfs_congested_write_threshold_us,
- &sysfs_clear_stats,
-
- &sysfs_meta_replicas_have,
- &sysfs_data_replicas_have,
-
- &sysfs_foreground_target_percent,
- &sysfs_tiering_percent,
-
- &sysfs_journal_flush,
- NULL
-};
-KTYPE(bch_fs);
-
-/* internal dir - just a wrapper */
-
-SHOW(bch_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
- return bch_fs_show(&c->kobj, attr, buf);
-}
-
-STORE(bch_fs_internal)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
- return bch_fs_store(&c->kobj, attr, buf, size);
-}
-
-static void bch_fs_internal_release(struct kobject *k)
-{
-}
-
-static struct attribute *bch_fs_internal_files[] = {
- &sysfs_journal_debug,
-
- &sysfs_alloc_debug,
-
- &sysfs_btree_gc_running,
-
- &sysfs_btree_nodes,
- &sysfs_btree_used_percent,
-
- &sysfs_bset_tree_stats,
- &sysfs_cache_read_races,
- &sysfs_writeback_keys_done,
- &sysfs_writeback_keys_failed,
-
- &sysfs_trigger_btree_coalesce,
- &sysfs_trigger_gc,
- &sysfs_prune_cache,
- &sysfs_foreground_write_ratelimit_enabled,
- &sysfs_copy_gc_enabled,
- &sysfs_tiering_enabled,
- sysfs_pd_controller_files(tiering),
- sysfs_pd_controller_files(foreground_write),
- &sysfs_internal_uuid,
-
-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
- NULL
-};
-KTYPE(bch_fs_internal);
-
-/* options */
-
-SHOW(bch_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
-
- return bch_opt_show(&c->opts, attr->name, buf, PAGE_SIZE);
-}
-
-STORE(bch_fs_opts_dir)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- const struct bch_option *opt;
- enum bch_opt_id id;
- u64 v;
-
- id = bch_parse_sysfs_opt(attr->name, buf, &v);
- if (id < 0)
- return id;
-
- opt = &bch_opt_table[id];
-
- mutex_lock(&c->sb_lock);
-
- if (id == Opt_compression) {
- int ret = bch_check_set_has_compressed_data(c, v);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- return ret;
- }
- }
-
- if (opt->set_sb != SET_NO_SB_OPT) {
- opt->set_sb(c->disk_sb, v);
- bch_write_super(c);
- }
-
- bch_opt_set(&c->opts, id, v);
-
- mutex_unlock(&c->sb_lock);
-
- return size;
-}
-
-static void bch_fs_opts_dir_release(struct kobject *k)
-{
-}
-
-static struct attribute *bch_fs_opts_dir_files[] = {
-#define BCH_OPT(_name, ...) \
- &sysfs_opt_##_name,
-
- BCH_VISIBLE_OPTS()
-#undef BCH_OPT
-
- NULL
-};
-KTYPE(bch_fs_opts_dir);
-
-/* time stats */
-
-SHOW(bch_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_print_time_stats(&c->name##_time, name, \
- frequency_units, duration_units);
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
-
- return 0;
-}
-
-STORE(bch_fs_time_stats)
-{
- struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
-
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_clear_time_stats(&c->name##_time, name);
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
-
- return size;
-}
-
-static void bch_fs_time_stats_release(struct kobject *k)
-{
-}
-
-static struct attribute *bch_fs_time_stats_files[] = {
-#define BCH_TIME_STAT(name, frequency_units, duration_units) \
- sysfs_time_stats_attribute_list(name, frequency_units, duration_units)
- BCH_TIME_STATS()
-#undef BCH_TIME_STAT
-
- NULL
-};
-KTYPE(bch_fs_time_stats);
-
-typedef unsigned (bucket_map_fn)(struct bch_dev *, struct bucket *, void *);
-
-static unsigned bucket_priority_fn(struct bch_dev *ca, struct bucket *g,
- void *private)
-{
- int rw = (private ? 1 : 0);
-
- return ca->fs->prio_clock[rw].hand - g->prio[rw];
-}
-
-static unsigned bucket_sectors_used_fn(struct bch_dev *ca, struct bucket *g,
- void *private)
-{
- return bucket_sectors_used(g);
-}
-
-static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, struct bucket *g,
- void *private)
-{
- return bucket_gc_gen(ca, g);
-}
-
-static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
- bucket_map_fn *fn, void *private)
-{
- int cmp(const void *l, const void *r)
- { return *((unsigned *) r) - *((unsigned *) l); }
-
- size_t n = ca->mi.nbuckets, i;
- /* Compute 31 quantiles */
- unsigned q[31], *p;
- ssize_t ret = 0;
-
- p = vzalloc(ca->mi.nbuckets * sizeof(unsigned));
- if (!p)
- return -ENOMEM;
-
- for (i = ca->mi.first_bucket; i < n; i++)
- p[i] = fn(ca, &ca->buckets[i], private);
-
- sort(p, n, sizeof(unsigned), cmp, NULL);
-
- while (n &&
- !p[n - 1])
- --n;
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
-
- vfree(p);
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "%u ", q[i]);
- buf[ret - 1] = '\n';
-
- return ret;
-
-}
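(Note: cmp() above is a nested function, a GNU C extension, used as the comparator passed to sort(); it orders the sampled bucket values descending, so the trailing zeroes can be trimmed before the 31 quantiles are taken.)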
-
-static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
-{
- enum alloc_reserve i;
- ssize_t ret;
-
- spin_lock(&ca->freelist_lock);
-
- ret = scnprintf(buf, PAGE_SIZE,
- "free_inc:\t%zu\t%zu\n",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- for (i = 0; i < RESERVE_NR; i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret,
- "free[%u]:\t%zu\t%zu\n", i,
- fifo_used(&ca->free[i]),
- ca->free[i].size);
-
- spin_unlock(&ca->freelist_lock);
-
- return ret;
-}
-
-static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
-{
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch_dev_usage_read(ca);
-
- return scnprintf(buf, PAGE_SIZE,
- "free_inc: %zu/%zu\n"
- "free[RESERVE_PRIO]: %zu/%zu\n"
- "free[RESERVE_BTREE]: %zu/%zu\n"
- "free[RESERVE_MOVINGGC]: %zu/%zu\n"
- "free[RESERVE_NONE]: %zu/%zu\n"
- "alloc: %llu/%llu\n"
- "meta: %llu/%llu\n"
- "dirty: %llu/%llu\n"
- "available: %llu/%llu\n"
- "freelist_wait: %s\n"
- "open buckets: %u/%u (reserved %u)\n"
- "open_buckets_wait: %s\n",
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size,
- fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
- stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets_meta, ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets_dirty, ca->mi.nbuckets - ca->mi.first_bucket,
- __dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
- c->freelist_wait.list.first ? "waiting" : "empty",
- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
- c->open_buckets_wait.list.first ? "waiting" : "empty");
-}
-
-static u64 sectors_written(struct bch_dev *ca)
-{
- u64 ret = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- ret += *per_cpu_ptr(ca->sectors_written, cpu);
-
- return ret;
-}
-
-SHOW(bch_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch_dev_usage_read(ca);
-
- sysfs_printf(uuid, "%pU\n", ca->uuid.b);
-
- sysfs_hprint(bucket_size, bucket_bytes(ca));
- sysfs_print(bucket_size_bytes, bucket_bytes(ca));
- sysfs_hprint(block_size, block_bytes(c));
- sysfs_print(block_size_bytes, block_bytes(c));
- sysfs_print(first_bucket, ca->mi.first_bucket);
- sysfs_print(nbuckets, ca->mi.nbuckets);
- sysfs_print(discard, ca->mi.discard);
- sysfs_hprint(written, sectors_written(ca) << 9);
- sysfs_hprint(btree_written,
- atomic64_read(&ca->btree_sectors_written) << 9);
- sysfs_hprint(metadata_written,
- (atomic64_read(&ca->meta_sectors_written) +
- atomic64_read(&ca->btree_sectors_written)) << 9);
-
- sysfs_print(io_errors,
- atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
-
- sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9);
- sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9);
- sysfs_print(dirty_buckets, stats.buckets_dirty);
- sysfs_hprint(cached_data, stats.sectors[S_CACHED] << 9);
- sysfs_print(cached_bytes, stats.sectors[S_CACHED] << 9);
- sysfs_print(cached_buckets, stats.buckets_cached);
- sysfs_print(meta_buckets, stats.buckets_meta);
- sysfs_print(alloc_buckets, stats.buckets_alloc);
- sysfs_print(available_buckets, dev_buckets_available(ca));
- sysfs_print(free_buckets, dev_buckets_free(ca));
- sysfs_print(has_data, ca->mi.has_data);
- sysfs_print(has_metadata, ca->mi.has_metadata);
-
- sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
-
- if (attr == &sysfs_cache_replacement_policy)
- return bch_snprint_string_list(buf, PAGE_SIZE,
- bch_cache_replacement_policies,
- ca->mi.replacement);
-
- sysfs_print(tier, ca->mi.tier);
-
- if (attr == &sysfs_state_rw)
- return bch_snprint_string_list(buf, PAGE_SIZE,
- bch_dev_state,
- ca->mi.state);
-
- if (attr == &sysfs_read_priority_stats)
- return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
- if (attr == &sysfs_write_priority_stats)
- return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1);
- if (attr == &sysfs_fragmentation_stats)
- return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL);
- if (attr == &sysfs_oldest_gen_stats)
- return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL);
- if (attr == &sysfs_reserve_stats)
- return show_reserve_stats(ca, buf);
- if (attr == &sysfs_alloc_debug)
- return show_dev_alloc_debug(ca, buf);
-
- return 0;
-}
-
-STORE(bch_dev)
-{
- struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
- struct bch_fs *c = ca->fs;
- struct bch_member *mi;
-
- sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
-
- if (attr == &sysfs_discard) {
- bool v = strtoul_or_return(buf);
-
- mutex_lock(&c->sb_lock);
- mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
-
- if (v != BCH_MEMBER_DISCARD(mi)) {
- SET_BCH_MEMBER_DISCARD(mi, v);
- bch_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
- }
-
- if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = bch_read_string_list(buf, bch_cache_replacement_policies);
-
- if (v < 0)
- return v;
-
- mutex_lock(&c->sb_lock);
- mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
-
- if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
- SET_BCH_MEMBER_REPLACEMENT(mi, v);
- bch_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
- }
-
- if (attr == &sysfs_tier) {
- unsigned prev_tier;
- unsigned v = strtoul_restrict_or_return(buf,
- 0, BCH_TIER_MAX - 1);
-
- mutex_lock(&c->sb_lock);
- prev_tier = ca->mi.tier;
-
- if (v == ca->mi.tier) {
- mutex_unlock(&c->sb_lock);
- return size;
- }
-
- mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
- SET_BCH_MEMBER_TIER(mi, v);
- bch_write_super(c);
-
- bch_dev_group_remove(&c->tiers[prev_tier].devs, ca);
- bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
- mutex_unlock(&c->sb_lock);
-
- bch_recalc_capacity(c);
- bch_tiering_start(c);
- }
-
- if (attr == &sysfs_clear_stats) {
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(ca->sectors_written, cpu) = 0;
-
- atomic64_set(&ca->btree_sectors_written, 0);
- atomic64_set(&ca->meta_sectors_written, 0);
- atomic_set(&ca->io_count, 0);
- atomic_set(&ca->io_errors, 0);
- }
-
- return size;
-}
-
-static struct attribute *bch_dev_files[] = {
- &sysfs_uuid,
- &sysfs_bucket_size,
- &sysfs_bucket_size_bytes,
- &sysfs_block_size,
- &sysfs_block_size_bytes,
- &sysfs_first_bucket,
- &sysfs_nbuckets,
- &sysfs_read_priority_stats,
- &sysfs_write_priority_stats,
- &sysfs_fragmentation_stats,
- &sysfs_oldest_gen_stats,
- &sysfs_reserve_stats,
- &sysfs_available_buckets,
- &sysfs_free_buckets,
- &sysfs_dirty_data,
- &sysfs_dirty_bytes,
- &sysfs_dirty_buckets,
- &sysfs_cached_data,
- &sysfs_cached_bytes,
- &sysfs_cached_buckets,
- &sysfs_meta_buckets,
- &sysfs_alloc_buckets,
- &sysfs_has_data,
- &sysfs_has_metadata,
- &sysfs_discard,
- &sysfs_written,
- &sysfs_btree_written,
- &sysfs_metadata_written,
- &sysfs_io_errors,
- &sysfs_clear_stats,
- &sysfs_cache_replacement_policy,
- &sysfs_tier,
- &sysfs_state_rw,
- &sysfs_alloc_debug,
-
- sysfs_pd_controller_files(copy_gc),
- NULL
-};
-KTYPE(bch_dev);
diff --git a/libbcache/sysfs.h b/libbcache/sysfs.h
deleted file mode 100644
index 02700246..00000000
--- a/libbcache/sysfs.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef _BCACHE_SYSFS_H_
-#define _BCACHE_SYSFS_H_
-
-#include "util.h"
-
-#define KTYPE(type) \
-struct kobj_type type ## _ktype = { \
- .release = type ## _release, \
- .sysfs_ops = &((const struct sysfs_ops) { \
- .show = type ## _show, \
- .store = type ## _store \
- }), \
- .default_attrs = type ## _files \
-}
-
-#define SHOW(fn) \
-static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
- char *buf) \
-
-#define STORE(fn) \
-static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
- const char *buf, size_t size) \
-
-#define __sysfs_attribute(_name, _mode) \
- static struct attribute sysfs_##_name = \
- { .name = #_name, .mode = _mode }
-
-#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
-#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
-#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
-
-#define sysfs_printf(file, fmt, ...) \
-do { \
- if (attr == &sysfs_ ## file) \
- return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
-} while (0)
-
-#define sysfs_print(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return snprint(buf, PAGE_SIZE, var); \
-} while (0)
-
-#define sysfs_hprint(file, val) \
-do { \
- if (attr == &sysfs_ ## file) { \
- ssize_t ret = bch_hprint(buf, val); \
- strcat(buf, "\n"); \
- return ret + 1; \
- } \
-} while (0)
-
-#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
-#define var_print(_var) sysfs_print(_var, var(_var))
-#define var_hprint(_var) sysfs_hprint(_var, var(_var))
-
-#define sysfs_strtoul(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe(buf, var) ?: (ssize_t) size; \
-} while (0)
-
-#define sysfs_strtoul_clamp(file, var, min, max) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoul_safe_clamp(buf, var, min, max) \
- ?: (ssize_t) size; \
-} while (0)
-
-#define strtoul_or_return(cp) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (_r) \
- return _r; \
- _v; \
-})
-
-#define strtoul_restrict_or_return(cp, min, max) \
-({ \
- unsigned long __v = 0; \
- int _r = strtoul_safe_restrict(cp, __v, min, max); \
- if (_r) \
- return _r; \
- __v; \
-})
-
-#define strtoi_h_or_return(cp) \
-({ \
- u64 _v; \
- int _r = strtoi_h(cp, &_v); \
- if (_r) \
- return _r; \
- _v; \
-})
-
-#define sysfs_hatoi(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoi_h(buf, &var) ?: (ssize_t) size; \
-} while (0)
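Worth noting: the *_or_return() statement expressions above return from the enclosing SHOW()/STORE() function on a parse error, which is why callers elsewhere in this diff can use them as plain initializers. An illustrative line only:

	u64 v = strtoi_h_or_return(buf);	/* on bad input, hands -EINVAL straight back to sysfs */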
-
-#endif /* _BCACHE_SYSFS_H_ */
diff --git a/libbcache/tier.c b/libbcache/tier.c
deleted file mode 100644
index 8627ac3e..00000000
--- a/libbcache/tier.c
+++ /dev/null
@@ -1,282 +0,0 @@
-
-#include "bcache.h"
-#include "alloc.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "clock.h"
-#include "extents.h"
-#include "io.h"
-#include "keylist.h"
-#include "move.h"
-#include "super-io.h"
-#include "tier.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <trace/events/bcache.h>
-
-struct tiering_state {
- struct bch_tier *tier;
- unsigned sectors;
- unsigned stripe_size;
- unsigned dev_idx;
- struct bch_dev *ca;
-};
-
-static bool tiering_pred(struct bch_fs *c,
- struct tiering_state *s,
- struct bkey_s_c k)
-{
- if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- unsigned replicas = 0;
-
- /* Make sure we have room to add a new pointer: */
- if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
- BKEY_EXTENT_VAL_U64s_MAX)
- return false;
-
- extent_for_each_ptr(e, ptr)
- if (c->devs[ptr->dev]->mi.tier >= s->tier->idx)
- replicas++;
-
- return replicas < c->opts.data_replicas;
- }
-
- return false;
-}
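(That is, tiering_pred() picks extents that still have fewer than opts.data_replicas copies on devices in this tier or a slower one, and skips extents whose value area could not hold one more pointer.)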
-
-static void tier_put_device(struct tiering_state *s)
-{
- if (s->ca)
- percpu_ref_put(&s->ca->io_ref);
- s->ca = NULL;
-}
-
-/**
- * tier_next_device - advance to the next device in the tier once a full
- * stripe has been written to the current one
- */
-static void tier_next_device(struct bch_fs *c, struct tiering_state *s)
-{
- if (!s->ca || s->sectors > s->stripe_size) {
- tier_put_device(s);
- s->sectors = 0;
- s->dev_idx++;
-
- spin_lock(&s->tier->devs.lock);
- if (s->dev_idx >= s->tier->devs.nr)
- s->dev_idx = 0;
-
- if (s->tier->devs.nr) {
- s->ca = s->tier->devs.d[s->dev_idx].dev;
- percpu_ref_get(&s->ca->io_ref);
- }
- spin_unlock(&s->tier->devs.lock);
- }
-}
-
-static int issue_tiering_move(struct bch_fs *c,
- struct tiering_state *s,
- struct moving_context *ctxt,
- struct bkey_s_c k)
-{
- int ret;
-
- ret = bch_data_move(c, ctxt, &s->ca->tiering_write_point, k, NULL);
- if (!ret) {
- trace_bcache_tiering_copy(k.k);
- s->sectors += k.k->size;
- } else {
- trace_bcache_tiering_alloc_fail(c, k.k->size);
- }
-
- return ret;
-}
-
-/**
- * read_tiering - scan the extents btree and move extents that still need a
- * copy in this tier to its devices, in round robin order
- */
-static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier)
-{
- struct moving_context ctxt;
- struct tiering_state s;
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned nr_devices = READ_ONCE(tier->devs.nr);
- int ret;
-
- if (!nr_devices)
- return 0;
-
- trace_bcache_tiering_start(c);
-
- memset(&s, 0, sizeof(s));
- s.tier = tier;
- s.stripe_size = 2048; /* 1 mb for now */
-
- bch_move_ctxt_init(&ctxt, &tier->pd.rate,
- nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
- bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
-
- while (!kthread_should_stop() &&
- !bch_move_ctxt_wait(&ctxt) &&
- (k = bch_btree_iter_peek(&iter)).k &&
- !btree_iter_err(k)) {
- if (!tiering_pred(c, &s, k))
- goto next;
-
- tier_next_device(c, &s);
- if (!s.ca)
- break;
-
- ret = issue_tiering_move(c, &s, &ctxt, k);
- if (ret) {
- bch_btree_iter_unlock(&iter);
-
- /* memory allocation failure, wait for some IO to finish */
- bch_move_ctxt_wait_for_io(&ctxt);
- continue;
- }
-next:
- bch_btree_iter_advance_pos(&iter);
- //bch_btree_iter_cond_resched(&iter);
-
- /* unlock before calling moving_context_wait() */
- bch_btree_iter_unlock(&iter);
- cond_resched();
- }
-
- bch_btree_iter_unlock(&iter);
- tier_put_device(&s);
- bch_move_ctxt_exit(&ctxt);
- trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved);
-
- return ctxt.sectors_moved;
-}
-
-static int bch_tiering_thread(void *arg)
-{
- struct bch_tier *tier = arg;
- struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
- struct io_clock *clock = &c->io_clock[WRITE];
- struct bch_dev *ca;
- u64 tier_capacity, available_sectors;
- unsigned long last;
- unsigned i;
-
- set_freezable();
-
- while (!kthread_should_stop()) {
- if (kthread_wait_freezable(c->tiering_enabled &&
- tier->devs.nr))
- break;
-
- while (1) {
- struct bch_tier *faster_tier;
-
- last = atomic_long_read(&clock->now);
-
- tier_capacity = available_sectors = 0;
- for (faster_tier = c->tiers;
- faster_tier != tier;
- faster_tier++) {
- spin_lock(&faster_tier->devs.lock);
- group_for_each_dev(ca, &faster_tier->devs, i) {
- tier_capacity +=
- (ca->mi.nbuckets -
- ca->mi.first_bucket) << ca->bucket_bits;
- available_sectors +=
- dev_buckets_available(ca) << ca->bucket_bits;
- }
- spin_unlock(&faster_tier->devs.lock);
- }
-
- if (available_sectors < (tier_capacity >> 1))
- break;
-
- bch_kthread_io_clock_wait(clock,
- last +
- available_sectors -
- (tier_capacity >> 1));
- if (kthread_should_stop())
- return 0;
- }
-
- read_tiering(c, tier);
- }
-
- return 0;
-}
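(The loop above batches the work: it sleeps on the write I/O clock until roughly available_sectors - tier_capacity/2 more sectors could have been written to the faster tiers -- i.e. until they are plausibly more than half full -- and only then rescans for extents to move down.)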
-
-static void __bch_tiering_stop(struct bch_tier *tier)
-{
- tier->pd.rate.rate = UINT_MAX;
- bch_ratelimit_reset(&tier->pd.rate);
-
- if (tier->migrate)
- kthread_stop(tier->migrate);
-
- tier->migrate = NULL;
-}
-
-void bch_tiering_stop(struct bch_fs *c)
-{
- struct bch_tier *tier;
-
- for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
- __bch_tiering_stop(tier);
-}
-
-static int __bch_tiering_start(struct bch_tier *tier)
-{
- if (!tier->migrate) {
- struct task_struct *p =
- kthread_create(bch_tiering_thread, tier,
- "bch_tier[%u]", tier->idx);
- if (IS_ERR(p))
- return PTR_ERR(p);
-
- tier->migrate = p;
- }
-
- wake_up_process(tier->migrate);
- return 0;
-}
-
-int bch_tiering_start(struct bch_fs *c)
-{
- struct bch_tier *tier;
- bool have_faster_tier = false;
-
- if (c->opts.nochanges)
- return 0;
-
- for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
- if (!tier->devs.nr)
- continue;
-
- if (have_faster_tier) {
- int ret = __bch_tiering_start(tier);
- if (ret)
- return ret;
- } else {
- __bch_tiering_stop(tier);
- }
-
- have_faster_tier = true;
- }
-
- return 0;
-}
-
-void bch_fs_tiering_init(struct bch_fs *c)
-{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
- c->tiers[i].idx = i;
- bch_pd_controller_init(&c->tiers[i].pd);
- }
-}
diff --git a/libbcache/tier.h b/libbcache/tier.h
deleted file mode 100644
index b6f8d4a2..00000000
--- a/libbcache/tier.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _BCACHE_TIER_H
-#define _BCACHE_TIER_H
-
-void bch_tiering_stop(struct bch_fs *);
-int bch_tiering_start(struct bch_fs *);
-void bch_fs_tiering_init(struct bch_fs *);
-
-#endif
diff --git a/libbcache/trace.c b/libbcache/trace.c
deleted file mode 100644
index def525d1..00000000
--- a/libbcache/trace.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "bcache.h"
-#include "alloc_types.h"
-#include "blockdev_types.h"
-#include "buckets.h"
-#include "btree_types.h"
-#include "keylist.h"
-
-#include <linux/blktrace_api.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/bcache.h>
diff --git a/libbcache/util.c b/libbcache/util.c
deleted file mode 100644
index 5f816593..00000000
--- a/libbcache/util.c
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * random utility code, for bcache but in theory not specific to bcache
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/seq_file.h>
-#include <linux/types.h>
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-
-#include "util.h"
-
-#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
-#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
-
-#define STRTO_H(name, type) \
-int bch_ ## name ## _h(const char *cp, type *res) \
-{ \
- int u = 0; \
- char *e; \
- type i = simple_ ## name(cp, &e, 10); \
- \
- switch (tolower(*e)) { \
- default: \
- return -EINVAL; \
- case 'y': \
- case 'z': \
- u++; \
- case 'e': \
- u++; \
- case 'p': \
- u++; \
- case 't': \
- u++; \
- case 'g': \
- u++; \
- case 'm': \
- u++; \
- case 'k': \
- u++; \
- if (e++ == cp) \
- return -EINVAL; \
- case '\n': \
- case '\0': \
- if (*e == '\n') \
- e++; \
- } \
- \
- if (*e) \
- return -EINVAL; \
- \
- while (u--) { \
- if ((type) ~0 > 0 && \
- (type) ~0 / 1024 <= i) \
- return -EINVAL; \
- if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \
- (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \
- return -EINVAL; \
- i *= 1024; \
- } \
- \
- *res = i; \
- return 0; \
-} \
-
-STRTO_H(strtoint, int)
-STRTO_H(strtouint, unsigned int)
-STRTO_H(strtoll, long long)
-STRTO_H(strtoull, unsigned long long)
-
-ssize_t bch_hprint(char *buf, s64 v)
-{
- static const char units[] = "?kMGTPEZY";
- char dec[4] = "";
- int u, t = 0;
-
- for (u = 0; v >= 1024 || v <= -1024; u++) {
- t = v & ~(~0 << 10);
- v >>= 10;
- }
-
- if (!u)
- return sprintf(buf, "%lli", v);
-
- /*
- * 103 is magic: t is in the range [-1023, 1023] and we want
- * to turn it into [-9, 9]
- */
- if (v < 100 && v > -100)
- snprintf(dec, sizeof(dec), ".%i", t / 103);
-
- return sprintf(buf, "%lli%s%c", v, dec, units[u]);
-}
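Two quick traces of the scaling loop above, assuming calls like the sysfs_hprint() users earlier in this diff (hprint_example is a made-up name):

static void hprint_example(void)
{
	char buf[16];

	bch_hprint(buf, 1536);		/* t = 512, v = 1, 512 / 103 = 4  -> "1.4k" */
	bch_hprint(buf, 1 << 20);	/* two passes: t = 0, v = 1       -> "1.0M" */
}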
-
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
- size_t selected)
-{
- char *out = buf;
- size_t i;
-
- for (i = 0; list[i]; i++)
- out += snprintf(out, buf + size - out,
- i == selected ? "[%s] " : "%s ", list[i]);
-
- out[-1] = '\n';
- return out - buf;
-}
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[])
-{
- size_t i;
- char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
- if (!d)
- return -ENOMEM;
-
- s = strim(d);
-
- for (i = 0; list[i]; i++)
- if (!strcmp(list[i], s))
- break;
-
- kfree(d);
-
- if (!list[i])
- return -EINVAL;
-
- return i;
-}
-
-bool bch_is_zero(const void *_p, size_t n)
-{
- const char *p = _p;
- size_t i;
-
- for (i = 0; i < n; i++)
- if (p[i])
- return false;
- return true;
-}
-
-void bch_time_stats_clear(struct time_stats *stats)
-{
- spin_lock(&stats->lock);
-
- stats->count = 0;
- stats->last_duration = 0;
- stats->max_duration = 0;
- stats->average_duration = 0;
- stats->average_frequency = 0;
- stats->last = 0;
-
- spin_unlock(&stats->lock);
-}
-
-void __bch_time_stats_update(struct time_stats *stats, u64 start_time)
-{
- u64 now, duration, last;
-
- stats->count++;
-
- now = local_clock();
- duration = time_after64(now, start_time)
- ? now - start_time : 0;
- last = time_after64(now, stats->last)
- ? now - stats->last : 0;
-
- stats->last_duration = duration;
- stats->max_duration = max(stats->max_duration, duration);
-
- if (stats->last) {
- stats->average_duration = ewma_add(stats->average_duration,
- duration << 8, 3);
-
- if (stats->average_frequency)
- stats->average_frequency =
- ewma_add(stats->average_frequency,
- last << 8, 3);
- else
- stats->average_frequency = last << 8;
- } else {
- stats->average_duration = duration << 8;
- }
-
- stats->last = now ?: 1;
-}
-
-void bch_time_stats_update(struct time_stats *stats, u64 start_time)
-{
- spin_lock(&stats->lock);
- __bch_time_stats_update(stats, start_time);
- spin_unlock(&stats->lock);
-}
-
-/**
- * bch_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- *
- * @d - the struct bch_ratelimit to update
- *
- * Returns the amount of time to delay by, in jiffies
- */
-u64 bch_ratelimit_delay(struct bch_ratelimit *d)
-{
- u64 now = local_clock();
-
- return time_after64(d->next, now)
- ? nsecs_to_jiffies(d->next - now)
- : 0;
-}
-
-/**
- * bch_ratelimit_increment() - increment @d by the amount of work done
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
- */
-void bch_ratelimit_increment(struct bch_ratelimit *d, u64 done)
-{
- u64 now = local_clock();
-
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
-
- if (time_before64(now + NSEC_PER_SEC, d->next))
- d->next = now + NSEC_PER_SEC;
-
- if (time_after64(now - NSEC_PER_SEC * 2, d->next))
- d->next = now - NSEC_PER_SEC * 2;
-}
-
-int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
-{
- while (1) {
- u64 delay = bch_ratelimit_delay(d);
-
- if (delay)
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (kthread_should_stop())
- return 1;
-
- if (!delay)
- return 0;
-
- schedule_timeout(delay);
- try_to_freeze();
- }
-}
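A minimal sketch of the intended caller pattern: account the work done with bch_ratelimit_increment(), then sleep off the accumulated debt. do_some_work() and rate_limited_worker_example() are made-up names.

u64 do_some_work(void);	/* hypothetical helper: returns sectors processed, 0 when done */

static int rate_limited_worker_example(struct bch_ratelimit *rate)
{
	for (;;) {
		u64 sectors = do_some_work();

		if (!sectors)
			return 0;

		bch_ratelimit_increment(rate, sectors);

		/* returns nonzero when the kthread is asked to stop */
		if (bch_ratelimit_wait_freezable_stoppable(rate))
			return -1;
	}
}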
-
-/*
- * Updates the PD controller. Attempts to scale input values to units per second.
- * @target: desired value
- * @actual: current value
- *
- * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
- * it makes actual go down.
- */
-void bch_pd_controller_update(struct bch_pd_controller *pd,
- s64 target, s64 actual, int sign)
-{
- s64 proportional, derivative, change;
-
- unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
-
- if (seconds_since_update == 0)
- return;
-
- pd->last_update = jiffies;
-
- proportional = actual - target;
- proportional *= seconds_since_update;
- proportional = div_s64(proportional, pd->p_term_inverse);
-
- derivative = actual - pd->last_actual;
- derivative = div_s64(derivative, seconds_since_update);
- derivative = ewma_add(pd->smoothed_derivative, derivative,
- (pd->d_term / seconds_since_update) ?: 1);
- derivative = derivative * pd->d_term;
- derivative = div_s64(derivative, pd->p_term_inverse);
-
- change = proportional + derivative;
-
- /* Don't increase rate if not keeping up */
- if (change > 0 &&
- pd->backpressure &&
- time_after64(local_clock(),
- pd->rate.next + NSEC_PER_MSEC))
- change = 0;
-
- change *= (sign * -1);
-
- pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
- 1, UINT_MAX);
-
- pd->last_actual = actual;
- pd->last_derivative = derivative;
- pd->last_proportional = proportional;
- pd->last_change = change;
- pd->last_target = target;
-}
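A sketch of a typical caller, assuming a periodic worker (cf. pd_controllers_update_seconds in the sysfs code above). For writeback-style control, raising the rate drives the amount of dirty data down, hence sign = -1; the function and parameter names here are made up for illustration.

static void writeback_pd_update_example(struct bch_pd_controller *pd,
					u64 dirty_sectors, u64 target_sectors)
{
	/* both values scaled to bytes; sign -1: a higher rate lowers "actual" */
	bch_pd_controller_update(pd,
				 target_sectors << 9,
				 dirty_sectors << 9,
				 -1);
}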
-
-void bch_pd_controller_init(struct bch_pd_controller *pd)
-{
- pd->rate.rate = 1024;
- pd->last_update = jiffies;
- pd->p_term_inverse = 6000;
- pd->d_term = 30;
- pd->d_smooth = pd->d_term;
- pd->backpressure = 1;
-}
-
-size_t bch_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
-{
- /* 2^64 - 1 is 20 digits, plus null byte */
- char rate[21];
- char actual[21];
- char target[21];
- char proportional[21];
- char derivative[21];
- char change[21];
- s64 next_io;
-
- bch_hprint(rate, pd->rate.rate);
- bch_hprint(actual, pd->last_actual);
- bch_hprint(target, pd->last_target);
- bch_hprint(proportional, pd->last_proportional);
- bch_hprint(derivative, pd->last_derivative);
- bch_hprint(change, pd->last_change);
-
- next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
-
- return sprintf(buf,
- "rate:\t\t%s/sec\n"
- "target:\t\t%s\n"
- "actual:\t\t%s\n"
- "proportional:\t%s\n"
- "derivative:\t%s\n"
- "change:\t\t%s/sec\n"
- "next io:\t%llims\n",
- rate, target, actual, proportional,
- derivative, change, next_io);
-}
-
-void bch_bio_map(struct bio *bio, void *base)
-{
- size_t size = bio->bi_iter.bi_size;
- struct bio_vec *bv = bio->bi_io_vec;
-
- BUG_ON(!bio->bi_iter.bi_size);
- BUG_ON(bio->bi_vcnt);
-
- bv->bv_offset = base ? offset_in_page(base) : 0;
- goto start;
-
- for (; size; bio->bi_vcnt++, bv++) {
- bv->bv_offset = 0;
-start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
- size);
- BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
- if (base) {
- bv->bv_page = is_vmalloc_addr(base)
- ? vmalloc_to_page(base)
- : virt_to_page(base);
-
- base += bv->bv_len;
- }
-
- size -= bv->bv_len;
- }
-}
-
-size_t bch_rand_range(size_t max)
-{
- size_t rand;
-
- do {
- get_random_bytes(&rand, sizeof(rand));
- rand &= roundup_pow_of_two(max) - 1;
- } while (rand >= max);
-
- return rand;
-}
-
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, dst, iter, dst_iter) {
- void *dstp = kmap_atomic(bv.bv_page);
- memcpy(dstp + bv.bv_offset, src, bv.bv_len);
- kunmap_atomic(dstp);
-
- src += bv.bv_len;
- }
-}
-
-void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
-{
- struct bio_vec bv;
- struct bvec_iter iter;
-
- __bio_for_each_segment(bv, src, iter, src_iter) {
- void *srcp = kmap_atomic(bv.bv_page);
- memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
- kunmap_atomic(srcp);
-
- dst += bv.bv_len;
- }
-}
diff --git a/libbcache/util.h b/libbcache/util.h
deleted file mode 100644
index 88cbe301..00000000
--- a/libbcache/util.h
+++ /dev/null
@@ -1,755 +0,0 @@
-#ifndef _BCACHE_UTIL_H
-#define _BCACHE_UTIL_H
-
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/errno.h>
-#include <linux/blkdev.h>
-#include <linux/freezer.h>
-#include <linux/kernel.h>
-#include <linux/llist.h>
-#include <linux/ratelimit.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/workqueue.h>
-
-#include "closure.h"
-
-#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
-#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT)
-
-struct closure;
-
-#ifdef CONFIG_BCACHE_DEBUG
-
-#define EBUG_ON(cond) BUG_ON(cond)
-#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
-#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
-#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0)
-#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0)
-#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0)
-#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0)
-#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0)
-#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i)
-#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
-#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-
-#define memcpy(_dst, _src, _len) \
-do { \
- BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
- (void *) (_dst) + (_len) <= (void *) (_src))); \
- memcpy(_dst, _src, _len); \
-} while (0)
-
-#else /* DEBUG */
-
-#define EBUG_ON(cond)
-#define atomic_dec_bug(v) atomic_dec(v)
-#define atomic_inc_bug(v, i) atomic_inc(v)
-#define atomic_sub_bug(i, v) atomic_sub(i, v)
-#define atomic_add_bug(i, v) atomic_add(i, v)
-#define atomic_long_dec_bug(v) atomic_long_dec(v)
-#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v)
-#define atomic64_dec_bug(v) atomic64_dec(v)
-#define atomic64_inc_bug(v, i) atomic64_inc(v)
-#define atomic64_sub_bug(i, v) atomic64_sub(i, v)
-#define atomic64_add_bug(i, v) atomic64_add(i, v)
-
-#endif
-
-#ifndef __CHECKER__
-#define __flatten __attribute__((flatten))
-#else
-/* sparse doesn't know about attribute((flatten)) */
-#define __flatten
-#endif
-
-#ifdef __LITTLE_ENDIAN
-#define CPU_BIG_ENDIAN 0
-#else
-#define CPU_BIG_ENDIAN 1
-#endif
-
-/* type hackery */
-
-#define type_is_exact(_val, _type) \
- __builtin_types_compatible_p(typeof(_val), _type)
-
-#define type_is(_val, _type) \
- (__builtin_types_compatible_p(typeof(_val), _type) || \
- __builtin_types_compatible_p(typeof(_val), const _type))
-
-static inline void *kvmalloc(size_t bytes, gfp_t gfp)
-{
- if (bytes <= PAGE_SIZE ||
- !(gfp & GFP_KERNEL))
- return kmalloc(bytes, gfp);
-
- return ((bytes <= KMALLOC_MAX_SIZE)
- ? kmalloc(bytes, gfp|__GFP_NOWARN)
- : NULL) ?:
- vmalloc(bytes);
-}
-
-#define DECLARE_HEAP(type, name) \
- struct { \
- size_t size, used; \
- type *data; \
- } name
-
-#define init_heap(heap, _size, gfp) \
-({ \
- size_t _bytes; \
- (heap)->used = 0; \
- (heap)->size = (_size); \
- _bytes = (heap)->size * sizeof(*(heap)->data); \
- (heap)->data = kvmalloc(_bytes, (gfp)); \
- (heap)->data; \
-})
-
-#define free_heap(heap) \
-do { \
- kvfree((heap)->data); \
- (heap)->data = NULL; \
-} while (0)
-
-#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
-
-#define heap_sift(h, i, cmp) \
-do { \
- size_t _r, _j = i; \
- \
- for (; _j * 2 + 1 < (h)->used; _j = _r) { \
- _r = _j * 2 + 1; \
- if (_r + 1 < (h)->used && \
- cmp((h)->data[_r], (h)->data[_r + 1])) \
- _r++; \
- \
- if (cmp((h)->data[_r], (h)->data[_j])) \
- break; \
- heap_swap(h, _r, _j); \
- } \
-} while (0)
-
-#define heap_sift_down(h, i, cmp) \
-do { \
- while (i) { \
- size_t p = (i - 1) / 2; \
- if (cmp((h)->data[i], (h)->data[p])) \
- break; \
- heap_swap(h, i, p); \
- i = p; \
- } \
-} while (0)
-
-#define heap_add(h, d, cmp) \
-({ \
- bool _r = !heap_full(h); \
- if (_r) { \
- size_t _i = (h)->used++; \
- (h)->data[_i] = d; \
- \
- heap_sift_down(h, _i, cmp); \
- heap_sift(h, _i, cmp); \
- } \
- _r; \
-})
-
-#define heap_del(h, i, cmp) \
-do { \
- size_t _i = (i); \
- \
- BUG_ON(_i >= (h)->used); \
- (h)->used--; \
- heap_swap(h, _i, (h)->used); \
- heap_sift_down(h, _i, cmp); \
- heap_sift(h, _i, cmp); \
-} while (0)
-
-#define heap_pop(h, d, cmp) \
-({ \
- bool _r = (h)->used; \
- if (_r) { \
- (d) = (h)->data[0]; \
- heap_del(h, 0, cmp); \
- } \
- _r; \
-})
-
-#define heap_peek(h) \
-({ \
- EBUG_ON(!(h)->used); \
- (h)->data[0]; \
-})
-
-#define heap_full(h) ((h)->used == (h)->size)
-
-#define heap_resort(heap, cmp) \
-do { \
- ssize_t _i; \
- for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
- heap_sift(heap, _i, cmp); \
-} while (0)
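A minimal usage sketch of the heap macros above (heap_example and example_cmp are made-up names). With cmp(l, r) defined as (l < r), the invariant cmp(child, parent) keeps the largest element at data[0], i.e. a max-heap:

#define example_cmp(l, r)	((l) < (r))

static void heap_example(void)
{
	DECLARE_HEAP(u32, heap);
	u32 top;

	if (!init_heap(&heap, 8, GFP_KERNEL))
		return;

	heap_add(&heap, 3, example_cmp);
	heap_add(&heap, 7, example_cmp);
	heap_add(&heap, 5, example_cmp);

	heap_pop(&heap, top, example_cmp);	/* top == 7 */
	free_heap(&heap);
}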
-
-/*
- * Simple array based allocator - preallocates a number of elements and you can
- * never allocate more than that, also has no locking.
- *
- * Handy because if you know you only need a fixed number of elements you don't
- * have to worry about memory allocation failure, and sometimes a mempool isn't
- * what you want.
- *
- * We treat the free elements as entries in a singly linked list, and the
- * freelist as a stack - allocating and freeing push and pop off the freelist.
- */
-
-#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \
- struct { \
- type *freelist; \
- type data[size]; \
- } name
-
-#define array_alloc(array) \
-({ \
- typeof((array)->freelist) _ret = (array)->freelist; \
- \
- if (_ret) \
- (array)->freelist = *((typeof((array)->freelist) *) _ret);\
- \
- _ret; \
-})
-
-#define array_free(array, ptr) \
-do { \
- typeof((array)->freelist) _ptr = ptr; \
- \
- *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \
- (array)->freelist = _ptr; \
-} while (0)
-
-#define array_allocator_init(array) \
-do { \
- typeof((array)->freelist) _i; \
- \
- BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \
- (array)->freelist = NULL; \
- \
- for (_i = (array)->data; \
- _i < (array)->data + ARRAY_SIZE((array)->data); \
- _i++) \
- array_free(array, _i); \
-} while (0)
-
-#define array_freelist_empty(array) ((array)->freelist == NULL)
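A short sketch of the array allocator above (array_alloc_example is a made-up name). Note that array_allocator_init() BUILD_BUG_ONs if the element type is smaller than a pointer, since the freelist is threaded through the free elements themselves:

static void array_alloc_example(void)
{
	DECLARE_ARRAY_ALLOCATOR(struct bio_vec, pool, 16);
	struct bio_vec *bv;

	array_allocator_init(&pool);

	bv = array_alloc(&pool);	/* NULL only once all 16 entries are in use */
	if (bv)
		array_free(&pool, bv);
}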
-
-#define ANYSINT_MAX(t) \
- ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-
-int bch_strtoint_h(const char *, int *);
-int bch_strtouint_h(const char *, unsigned int *);
-int bch_strtoll_h(const char *, long long *);
-int bch_strtoull_h(const char *, unsigned long long *);
-
-static inline int bch_strtol_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch_strtoint_h(cp, (int *) res);
-#else
- return bch_strtoll_h(cp, (long long *) res);
-#endif
-}
-
-static inline int bch_strtoul_h(const char *cp, long *res)
-{
-#if BITS_PER_LONG == 32
- return bch_strtouint_h(cp, (unsigned int *) res);
-#else
- return bch_strtoull_h(cp, (unsigned long long *) res);
-#endif
-}
-
-#define strtoi_h(cp, res) \
- ( type_is(*res, int) ? bch_strtoint_h(cp, (void *) res)\
- : type_is(*res, long) ? bch_strtol_h(cp, (void *) res)\
- : type_is(*res, long long) ? bch_strtoll_h(cp, (void *) res)\
- : type_is(*res, unsigned) ? bch_strtouint_h(cp, (void *) res)\
- : type_is(*res, unsigned long) ? bch_strtoul_h(cp, (void *) res)\
- : type_is(*res, unsigned long long) ? bch_strtoull_h(cp, (void *) res)\
- : -EINVAL)
-
-#define strtoul_safe(cp, var) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = _v; \
- _r; \
-})
-
-#define strtoul_safe_clamp(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r) \
- var = clamp_t(typeof(var), _v, min, max); \
- _r; \
-})
-
-#define strtoul_safe_restrict(cp, var, min, max) \
-({ \
- unsigned long _v; \
- int _r = kstrtoul(cp, 10, &_v); \
- if (!_r && _v >= min && _v <= max) \
- var = _v; \
- else \
- _r = -EINVAL; \
- _r; \
-})
-
-#define snprint(buf, size, var) \
- snprintf(buf, size, \
- type_is(var, int) ? "%i\n" \
- : type_is(var, unsigned) ? "%u\n" \
- : type_is(var, long) ? "%li\n" \
- : type_is(var, unsigned long) ? "%lu\n" \
- : type_is(var, s64) ? "%lli\n" \
- : type_is(var, u64) ? "%llu\n" \
- : type_is(var, char *) ? "%s\n" \
- : "%i\n", var)
-
-ssize_t bch_hprint(char *buf, s64 v);
-
-bool bch_is_zero(const void *, size_t);
-
-ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
- size_t selected);
-
-ssize_t bch_read_string_list(const char *buf, const char * const list[]);
-
-struct time_stats {
- spinlock_t lock;
- u64 count;
- /*
- * all fields are in nanoseconds, averages are ewmas stored left shifted
- * by 8
- */
- u64 last_duration;
- u64 max_duration;
- u64 average_duration;
- u64 average_frequency;
- u64 last;
-};
-
-void bch_time_stats_clear(struct time_stats *stats);
-void __bch_time_stats_update(struct time_stats *stats, u64 time);
-void bch_time_stats_update(struct time_stats *stats, u64 time);
-
-static inline unsigned local_clock_us(void)
-{
- return local_clock() >> 10;
-}
-
-#define NSEC_PER_ns 1L
-#define NSEC_PER_us NSEC_PER_USEC
-#define NSEC_PER_ms NSEC_PER_MSEC
-#define NSEC_PER_sec NSEC_PER_SEC
-
-#define __print_time_stat(stats, name, stat, units) \
- sysfs_print(name ## _ ## stat ## _ ## units, \
- div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
-
-#define sysfs_print_time_stats(stats, name, \
- frequency_units, \
- duration_units) \
-do { \
- __print_time_stat(stats, name, \
- average_frequency, frequency_units); \
- __print_time_stat(stats, name, \
- average_duration, duration_units); \
- sysfs_print(name ## _ ##count, (stats)->count); \
- sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \
- div_u64((stats)->last_duration, \
- NSEC_PER_ ## duration_units)); \
- sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
- div_u64((stats)->max_duration, \
- NSEC_PER_ ## duration_units)); \
- \
- sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
- ? div_s64(local_clock() - (stats)->last, \
- NSEC_PER_ ## frequency_units) \
- : -1LL); \
-} while (0)
-
-#define sysfs_clear_time_stats(stats, name) \
-do { \
- if (attr == &sysfs_ ## name ## _clear) \
- bch_time_stats_clear(stats); \
-} while (0)
-
-#define sysfs_time_stats_attribute(name, \
- frequency_units, \
- duration_units) \
-write_attribute(name ## _clear); \
-read_attribute(name ## _count); \
-read_attribute(name ## _average_frequency_ ## frequency_units); \
-read_attribute(name ## _average_duration_ ## duration_units); \
-read_attribute(name ## _last_duration_ ## duration_units); \
-read_attribute(name ## _max_duration_ ## duration_units); \
-read_attribute(name ## _last_ ## frequency_units)
-
-#define sysfs_time_stats_attribute_list(name, \
- frequency_units, \
- duration_units) \
-&sysfs_ ## name ## _clear, \
-&sysfs_ ## name ## _count, \
-&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
-&sysfs_ ## name ## _average_duration_ ## duration_units, \
-&sysfs_ ## name ## _last_duration_ ## duration_units, \
-&sysfs_ ## name ## _max_duration_ ## duration_units, \
-&sysfs_ ## name ## _last_ ## frequency_units,
-
-#define ewma_add(ewma, val, weight) \
-({ \
- typeof(ewma) _ewma = (ewma); \
- typeof(weight) _weight = (weight); \
- \
- (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
-})
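In other words, ewma_add() evaluates to roughly ewma + (val - ewma) / 2^weight, with integer truncation; a quick check:

	/* ewma_add(100, 200, 3) == ((100 << 3) - 100 + 200) >> 3 == 900 >> 3 == 112 */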
-
-struct bch_ratelimit {
- /* Next time we want to do some work, in nanoseconds */
- u64 next;
-
- /*
- * Rate at which we want to do work, in units per nanosecond
- * The units here correspond to the units passed to
- * bch_ratelimit_increment()
- */
- unsigned rate;
-};
-
-static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
-{
- d->next = local_clock();
-}
-
-u64 bch_ratelimit_delay(struct bch_ratelimit *);
-void bch_ratelimit_increment(struct bch_ratelimit *, u64);
-int bch_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *);
-
-struct bch_pd_controller {
- struct bch_ratelimit rate;
- unsigned long last_update;
-
- s64 last_actual;
- s64 smoothed_derivative;
-
- unsigned p_term_inverse;
- unsigned d_smooth;
- unsigned d_term;
-
- /* for exporting to sysfs (no effect on behavior) */
- s64 last_derivative;
- s64 last_proportional;
- s64 last_change;
- s64 last_target;
-
- /* If true, the rate will not increase if bch_ratelimit_delay()
- * is not being called often enough. */
- bool backpressure;
-};
-
-void bch_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
-void bch_pd_controller_init(struct bch_pd_controller *);
-size_t bch_pd_controller_print_debug(struct bch_pd_controller *, char *);
-
-#define sysfs_pd_controller_attribute(name) \
- rw_attribute(name##_rate); \
- rw_attribute(name##_rate_bytes); \
- rw_attribute(name##_rate_d_term); \
- rw_attribute(name##_rate_p_term_inverse); \
- read_attribute(name##_rate_debug)
-
-#define sysfs_pd_controller_files(name) \
- &sysfs_##name##_rate, \
- &sysfs_##name##_rate_bytes, \
- &sysfs_##name##_rate_d_term, \
- &sysfs_##name##_rate_p_term_inverse, \
- &sysfs_##name##_rate_debug
-
-#define sysfs_pd_controller_show(name, var) \
-do { \
- sysfs_hprint(name##_rate, (var)->rate.rate); \
- sysfs_print(name##_rate_bytes, (var)->rate.rate); \
- sysfs_print(name##_rate_d_term, (var)->d_term); \
- sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
- \
- if (attr == &sysfs_##name##_rate_debug) \
- return bch_pd_controller_print_debug(var, buf); \
-} while (0)
-
-#define sysfs_pd_controller_store(name, var) \
-do { \
- sysfs_strtoul_clamp(name##_rate, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul_clamp(name##_rate_bytes, \
- (var)->rate.rate, 1, UINT_MAX); \
- sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
- sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
- (var)->p_term_inverse, 1, INT_MAX); \
-} while (0)
-
-#define __DIV_SAFE(n, d, zero) \
-({ \
- typeof(n) _n = (n); \
- typeof(d) _d = (d); \
- _d ? _n / _d : zero; \
-})
-
-#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
-
-#define container_of_or_null(ptr, type, member) \
-({ \
- typeof(ptr) _ptr = ptr; \
- _ptr ? container_of(_ptr, type, member) : NULL; \
-})
-
-#define RB_INSERT(root, new, member, cmp) \
-({ \
- __label__ dup; \
- struct rb_node **n = &(root)->rb_node, *parent = NULL; \
- typeof(new) this; \
- int res, ret = -1; \
- \
- while (*n) { \
- parent = *n; \
- this = container_of(*n, typeof(*(new)), member); \
- res = cmp(new, this); \
- if (!res) \
- goto dup; \
- n = res < 0 \
- ? &(*n)->rb_left \
- : &(*n)->rb_right; \
- } \
- \
- rb_link_node(&(new)->member, parent, n); \
- rb_insert_color(&(new)->member, root); \
- ret = 0; \
-dup: \
- ret; \
-})
-
-#define RB_SEARCH(root, search, member, cmp) \
-({ \
- struct rb_node *n = (root)->rb_node; \
- typeof(&(search)) this, ret = NULL; \
- int res; \
- \
- while (n) { \
- this = container_of(n, typeof(search), member); \
- res = cmp(&(search), this); \
- if (!res) { \
- ret = this; \
- break; \
- } \
- n = res < 0 \
- ? n->rb_left \
- : n->rb_right; \
- } \
- ret; \
-})
-
-#define RB_GREATER(root, search, member, cmp) \
-({ \
- struct rb_node *n = (root)->rb_node; \
- typeof(&(search)) this, ret = NULL; \
- int res; \
- \
- while (n) { \
- this = container_of(n, typeof(search), member); \
- res = cmp(&(search), this); \
- if (res < 0) { \
- ret = this; \
- n = n->rb_left; \
- } else \
- n = n->rb_right; \
- } \
- ret; \
-})
-
-#define RB_FIRST(root, type, member) \
- container_of_or_null(rb_first(root), type, member)
-
-#define RB_LAST(root, type, member) \
- container_of_or_null(rb_last(root), type, member)
-
-#define RB_NEXT(ptr, member) \
- container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
-
-#define RB_PREV(ptr, member) \
- container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
-
-/* Does linear interpolation between powers of two */
-static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
-{
- unsigned fract = x & ~(~0 << fract_bits);
-
- x >>= fract_bits;
- x = 1 << x;
- x += (x * fract) >> fract_bits;
-
- return x;
-}
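A worked example of the interpolation above, assuming fract_bits == 3:

	/*
	 * fract_exp_two(12, 3): fract = 12 & 7 = 4, x = 12 >> 3 = 1,
	 * then x = 1 << 1 = 2 and x += (2 * 4) >> 3 = 1, giving 3 --
	 * a linear step between 2^1 == 2 and 2^2 == 4, approximating 2^1.5.
	 */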
-
-void bch_bio_map(struct bio *bio, void *base);
-
-static inline sector_t bdev_sectors(struct block_device *bdev)
-{
- return bdev->bd_inode->i_size >> 9;
-}
-
-#define closure_bio_submit(bio, cl) \
-do { \
- closure_get(cl); \
- generic_make_request(bio); \
-} while (0)
-
-#define closure_bio_submit_punt(bio, cl, c) \
-do { \
- closure_get(cl); \
- bch_generic_make_request(bio, c); \
-} while (0)
-
-#define kthread_wait_freezable(cond) \
-({ \
- int _ret = 0; \
- while (1) { \
- set_current_state(TASK_INTERRUPTIBLE); \
- if (kthread_should_stop()) { \
- _ret = -1; \
- break; \
- } \
- \
- if (cond) \
- break; \
- \
- schedule(); \
- try_to_freeze(); \
- } \
- set_current_state(TASK_RUNNING); \
- _ret; \
-})
-
-size_t bch_rand_range(size_t);
-
-void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
-void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
-
-static inline void __memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
-#ifdef CONFIG_X86_64
- long d0, d1, d2;
- asm volatile("rep ; movsq"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- u64 *d = dst;
- const u64 *s = src;
-
- while (u64s--)
- *d++ = *s++;
-#endif
-}
-
-static inline void memcpy_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
- dst + u64s * sizeof(u64) <= src));
-
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- __memcpy_u64s(dst, src, u64s);
-}
-
-static inline void memmove_u64s_down(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst > src);
-
- __memmove_u64s_down(dst, src, u64s);
-}
-
-static inline void __memmove_u64s_up(void *_dst, const void *_src,
- unsigned u64s)
-{
- u64 *dst = (u64 *) _dst + u64s - 1;
- u64 *src = (u64 *) _src + u64s - 1;
-
-#ifdef CONFIG_X86_64
- long d0, d1, d2;
- asm volatile("std ;\n"
- "rep ; movsq\n"
- "cld ;\n"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (u64s), "1" (dst), "2" (src)
- : "memory");
-#else
- while (u64s--)
- *dst-- = *src--;
-#endif
-}
-
-static inline void memmove_u64s_up(void *dst, const void *src,
- unsigned u64s)
-{
- EBUG_ON(dst < src);
-
- __memmove_u64s_up(dst, src, u64s);
-}
-
-static inline void memmove_u64s(void *dst, const void *src,
- unsigned u64s)
-{
- if (dst < src)
- __memmove_u64s_down(dst, src, u64s);
- else
- __memmove_u64s_up(dst, src, u64s);
-}
-
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
-#endif /* _BCACHE_UTIL_H */
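
The fract_exp_two() helper removed above approximates 2^(x / 2^fract_bits): the low fract_bits of x select a position between two adjacent powers of two, and the result is interpolated linearly between them. A minimal user-space sketch of how it behaves (the function body mirrors the one in the deleted header, with ~0 made unsigned; the driver around it is illustrative only):

#include <stdio.h>

/* Mirrors fract_exp_two() from the deleted util.h: linear interpolation
 * between powers of two, with the low fract_bits as the fraction. */
static unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
	unsigned fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x = 1 << x;
	x += (x * fract) >> fract_bits;

	return x;
}

int main(void)
{
	unsigned x;

	/* With fract_bits = 3: x = 8 is exactly 2^1 = 2, x = 12 is halfway
	 * to 2^2 and yields 3, x = 16 is exactly 2^2 = 4. */
	for (x = 0; x <= 16; x += 4)
		printf("fract_exp_two(%2u, 3) = %u\n", x, fract_exp_two(x, 3));

	return 0;
}
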
diff --git a/libbcache/vstructs.h b/libbcache/vstructs.h
deleted file mode 100644
index ce2cece0..00000000
--- a/libbcache/vstructs.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef _VSTRUCTS_H
-#define _VSTRUCTS_H
-
-#include "util.h"
-
-/*
- * NOTE: we can't differentiate between __le64 and u64 with type_is - this
- * assumes u64 is little endian:
- */
-#define __vstruct_u64s(_s) \
-({ \
- ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \
- : ((_s)->u64s)); \
-})
-
-#define __vstruct_bytes(_type, _u64s) \
-({ \
- BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
- \
- (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
-})
-
-#define vstruct_bytes(_s) \
- __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
-
-#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
- (round_up(__vstruct_bytes(_type, _u64s), \
- 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
-
-#define vstruct_blocks(_s, _sector_block_bits) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
-
-#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
- __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
- __vstruct_u64s(_s) + (_u64s))
-
-#define vstruct_sectors(_s, _sector_block_bits) \
- (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
-
-#define vstruct_next(_s) \
- ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_last(_s) \
- ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s)))
-#define vstruct_end(_s) \
- ((void *) ((_s)->_data + __vstruct_u64s(_s)))
-
-#define vstruct_for_each(_s, _i) \
- for (_i = (_s)->start; \
- _i < vstruct_last(_s); \
- _i = vstruct_next(_i))
-
-#define vstruct_for_each_safe(_s, _i, _t) \
- for (_i = (_s)->start; \
- _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
- _i = _t)
-
-#define vstruct_idx(_s, _idx) \
- ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
-
-#endif /* _VSTRUCTS_H */
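
These macros walk structures whose header carries a little-endian u64s count followed by that many 64-bit words of payload in a flexible _data[] member. A hedged user-space sketch of that layout and of the arithmetic behind __vstruct_bytes() and vstruct_next(); the struct name and fields are hypothetical, and host endianness stands in for the le16/le32/le64 handling above:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical example of the kind of structure the vstruct macros walk. */
struct vstruct_example {
	uint16_t	u64s;		/* payload size, in u64s */
	uint16_t	flags;
	uint32_t	pad;
	uint64_t	_data[];	/* u64s * sizeof(u64) bytes of payload */
};

/* Same arithmetic as __vstruct_bytes(): header plus payload. */
static size_t example_bytes(const struct vstruct_example *s)
{
	return offsetof(struct vstruct_example, _data) +
		(size_t) s->u64s * sizeof(uint64_t);
}

int main(void)
{
	/* Buffer large enough to hold the header plus three u64s of payload. */
	union {
		unsigned char		buf[64];
		struct vstruct_example	s;
	} u = { .s = { .u64s = 3 } };

	/* vstruct_next() would land right after the payload: */
	const void *next = u.s._data + u.s.u64s;

	printf("entry is %zu bytes; next entry at offset %zu\n",
	       example_bytes(&u.s),
	       (size_t) ((const unsigned char *) next - u.buf));
	return 0;
}
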
diff --git a/libbcache/writeback.c b/libbcache/writeback.c
deleted file mode 100644
index 279cfe67..00000000
--- a/libbcache/writeback.c
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- * background writeback - scan btree for dirty data and write it to the backing
- * device
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcache.h"
-#include "btree_update.h"
-#include "clock.h"
-#include "debug.h"
-#include "error.h"
-#include "extents.h"
-#include "io.h"
-#include "keybuf.h"
-#include "keylist.h"
-#include "writeback.h"
-
-#include <linux/delay.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <trace/events/bcache.h>
-
-/* Rate limiting */
-
-static void __update_writeback_rate(struct cached_dev *dc)
-{
- struct bch_fs *c = dc->disk.c;
- u64 cache_dirty_target =
- div_u64(c->capacity * dc->writeback_percent, 100);
- s64 target = div64_u64(cache_dirty_target *
- bdev_sectors(dc->disk_sb.bdev),
- c->cached_dev_sectors);
- s64 dirty = bcache_dev_sectors_dirty(&dc->disk);
-
- bch_pd_controller_update(&dc->writeback_pd, target << 9,
- dirty << 9, -1);
-}
-
-static void update_writeback_rate(struct work_struct *work)
-{
- struct cached_dev *dc = container_of(to_delayed_work(work),
- struct cached_dev,
- writeback_pd_update);
-
- down_read(&dc->writeback_lock);
-
- if (atomic_read(&dc->has_dirty) &&
- dc->writeback_percent &&
- !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
- __update_writeback_rate(dc);
- else
- dc->writeback_pd.rate.rate = UINT_MAX;
-
- up_read(&dc->writeback_lock);
-
- schedule_delayed_work(&dc->writeback_pd_update,
- dc->writeback_pd_update_seconds * HZ);
-}
-
-struct dirty_io {
- struct closure cl;
- struct bch_replace_info replace;
- struct cached_dev *dc;
- struct bch_dev *ca;
- struct keybuf_key *w;
- struct bch_extent_ptr ptr;
- int error;
- bool from_mempool;
- /* Must be last */
- struct bio bio;
-};
-
-#define DIRTY_IO_MEMPOOL_BVECS 64
-#define DIRTY_IO_MEMPOOL_SECTORS (DIRTY_IO_MEMPOOL_BVECS * PAGE_SECTORS)
-
-static void dirty_init(struct dirty_io *io)
-{
- struct bio *bio = &io->bio;
-
- bio_init(bio);
- if (!io->dc->writeback_percent)
- bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-
- bio->bi_iter.bi_size = io->replace.key.k.size << 9;
- bio->bi_max_vecs =
- DIV_ROUND_UP(io->replace.key.k.size, PAGE_SECTORS);
- bio->bi_io_vec = bio->bi_inline_vecs;
- bch_bio_map(bio, NULL);
-}
-
-static void dirty_io_destructor(struct closure *cl)
-{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
-
- if (io->from_mempool)
- mempool_free(io, &io->dc->writeback_io_pool);
- else
- kfree(io);
-}
-
-static void write_dirty_finish(struct closure *cl)
-{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
- struct cached_dev *dc = io->dc;
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment_all(bv, &io->bio, i)
- mempool_free(bv->bv_page, &dc->writeback_page_pool);
-
- if (!io->error) {
- BKEY_PADDED(k) tmp;
- int ret;
-
- bkey_copy(&tmp.k, &io->replace.key);
- io->replace.hook.fn = bch_extent_cmpxchg;
- bkey_extent_set_cached(&tmp.k.k, true);
-
- ret = bch_btree_insert(dc->disk.c, BTREE_ID_EXTENTS, &tmp.k,
- NULL, &io->replace.hook, NULL, 0);
- if (io->replace.successes == 0)
- trace_bcache_writeback_collision(&io->replace.key.k);
-
- atomic_long_inc(ret
- ? &dc->disk.c->writeback_keys_failed
- : &dc->disk.c->writeback_keys_done);
- }
-
- bch_keybuf_put(&dc->writeback_keys, io->w);
-
- closure_return_with_destructor(cl, dirty_io_destructor);
-}
-
-static void dirty_endio(struct bio *bio)
-{
- struct dirty_io *io = container_of(bio, struct dirty_io, bio);
-
- if (bio->bi_error) {
- trace_bcache_writeback_error(&io->replace.key.k,
- op_is_write(bio_op(&io->bio)),
- bio->bi_error);
- io->error = bio->bi_error;
- }
-
- closure_put(&io->cl);
-}
-
-static void write_dirty(struct closure *cl)
-{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
-
- if (!io->error) {
- dirty_init(io);
- bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
- io->bio.bi_iter.bi_sector =
- bkey_start_offset(&io->replace.key.k);
- io->bio.bi_bdev = io->dc->disk_sb.bdev;
- io->bio.bi_end_io = dirty_endio;
-
- closure_bio_submit(&io->bio, cl);
- }
-
- continue_at(cl, write_dirty_finish, io->dc->disk.c->wq);
-}
-
-static void read_dirty_endio(struct bio *bio)
-{
- struct dirty_io *io = container_of(bio, struct dirty_io, bio);
-
- bch_dev_nonfatal_io_err_on(bio->bi_error, io->ca, "writeback read");
-
- bch_account_io_completion(io->ca);
-
- if (ptr_stale(io->ca, &io->ptr))
- bio->bi_error = -EINTR;
-
- dirty_endio(bio);
-}
-
-static void read_dirty_submit(struct closure *cl)
-{
- struct dirty_io *io = container_of(cl, struct dirty_io, cl);
-
- closure_bio_submit(&io->bio, cl);
-
- continue_at(cl, write_dirty, system_freezable_wq);
-}
-
-static u64 read_dirty(struct cached_dev *dc)
-{
- struct keybuf_key *w;
- struct dirty_io *io;
- struct closure cl;
- unsigned i;
- struct bio_vec *bv;
- u64 sectors_written = 0;
- BKEY_PADDED(k) tmp;
-
- closure_init_stack(&cl);
-
- while (!bch_ratelimit_wait_freezable_stoppable(&dc->writeback_pd.rate)) {
- w = bch_keybuf_next(&dc->writeback_keys);
- if (!w)
- break;
-
- sectors_written += w->key.k.size;
- bkey_copy(&tmp.k, &w->key);
-
- while (tmp.k.k.size) {
- struct extent_pick_ptr pick;
-
- bch_extent_pick_ptr(dc->disk.c,
- bkey_i_to_s_c(&tmp.k),
- &pick);
- if (IS_ERR_OR_NULL(pick.ca))
- break;
-
- io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
- DIV_ROUND_UP(tmp.k.k.size,
- PAGE_SECTORS),
- GFP_KERNEL);
- if (!io) {
- trace_bcache_writeback_alloc_fail(pick.ca->fs,
- tmp.k.k.size);
- io = mempool_alloc(&dc->writeback_io_pool,
- GFP_KERNEL);
- memset(io, 0, sizeof(*io) +
- sizeof(struct bio_vec) *
- DIRTY_IO_MEMPOOL_BVECS);
- io->from_mempool = true;
-
- bkey_copy(&io->replace.key, &tmp.k);
-
- if (DIRTY_IO_MEMPOOL_SECTORS <
- io->replace.key.k.size)
- bch_key_resize(&io->replace.key.k,
- DIRTY_IO_MEMPOOL_SECTORS);
- } else {
- bkey_copy(&io->replace.key, &tmp.k);
- }
-
- io->dc = dc;
- io->ca = pick.ca;
- io->w = w;
- io->ptr = pick.ptr;
- atomic_inc(&w->ref);
-
- dirty_init(io);
- bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
- io->bio.bi_iter.bi_sector = pick.ptr.offset;
- io->bio.bi_bdev = pick.ca->disk_sb.bdev;
- io->bio.bi_end_io = read_dirty_endio;
-
- bio_for_each_segment_all(bv, &io->bio, i) {
- bv->bv_page =
- mempool_alloc(&dc->writeback_page_pool,
- i ? GFP_NOWAIT
- : GFP_KERNEL);
- if (!bv->bv_page) {
- BUG_ON(!i);
- io->bio.bi_vcnt = i;
-
- io->bio.bi_iter.bi_size =
- io->bio.bi_vcnt * PAGE_SIZE;
-
- bch_key_resize(&io->replace.key.k,
- bio_sectors(&io->bio));
- break;
- }
- }
-
- bch_cut_front(io->replace.key.k.p, &tmp.k);
- trace_bcache_writeback(&io->replace.key.k);
-
- bch_ratelimit_increment(&dc->writeback_pd.rate,
- io->replace.key.k.size << 9);
-
- closure_call(&io->cl, read_dirty_submit, NULL, &cl);
- }
-
- bch_keybuf_put(&dc->writeback_keys, w);
- }
-
- /*
- * Wait for outstanding writeback IOs to finish (and keybuf slots to be
- * freed) before refilling again
- */
- closure_sync(&cl);
-
- return sectors_written;
-}
-
-/* Scan for dirty data */
-
-static void __bcache_dev_sectors_dirty_add(struct bcache_device *d,
- u64 offset, int nr_sectors)
-{
- unsigned stripe_offset, stripe, sectors_dirty;
-
- if (!d)
- return;
-
- if (!d->stripe_sectors_dirty)
- return;
-
- stripe = offset_to_stripe(d, offset);
- stripe_offset = offset & (d->stripe_size - 1);
-
- while (nr_sectors) {
- int s = min_t(unsigned, abs(nr_sectors),
- d->stripe_size - stripe_offset);
-
- if (nr_sectors < 0)
- s = -s;
-
- if (stripe >= d->nr_stripes)
- return;
-
- sectors_dirty = atomic_add_return(s,
- d->stripe_sectors_dirty + stripe);
- if (sectors_dirty == d->stripe_size)
- set_bit(stripe, d->full_dirty_stripes);
- else
- clear_bit(stripe, d->full_dirty_stripes);
-
- nr_sectors -= s;
- stripe_offset = 0;
- stripe++;
- }
-}
-
-void bcache_dev_sectors_dirty_add(struct bch_fs *c, unsigned inode,
- u64 offset, int nr_sectors)
-{
- struct bcache_device *d;
-
- rcu_read_lock();
- d = bch_dev_find(c, inode);
- if (d)
- __bcache_dev_sectors_dirty_add(d, offset, nr_sectors);
- rcu_read_unlock();
-}
-
-static bool dirty_pred(struct keybuf *buf, struct bkey_s_c k)
-{
- struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
-
- BUG_ON(k.k->p.inode != bcache_dev_inum(&dc->disk));
-
- return bkey_extent_is_data(k.k) &&
- !bkey_extent_is_cached(k.k);
-}
-
-static void refill_full_stripes(struct cached_dev *dc)
-{
- struct keybuf *buf = &dc->writeback_keys;
- unsigned inode = bcache_dev_inum(&dc->disk);
- unsigned start_stripe, stripe, next_stripe;
- bool wrapped = false;
-
- stripe = offset_to_stripe(&dc->disk, buf->last_scanned.offset);
-
- if (stripe >= dc->disk.nr_stripes)
- stripe = 0;
-
- start_stripe = stripe;
-
- while (1) {
- stripe = find_next_bit(dc->disk.full_dirty_stripes,
- dc->disk.nr_stripes, stripe);
-
- if (stripe == dc->disk.nr_stripes)
- goto next;
-
- next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
- dc->disk.nr_stripes, stripe);
-
- buf->last_scanned = POS(inode,
- stripe * dc->disk.stripe_size);
-
- bch_refill_keybuf(dc->disk.c, buf,
- POS(inode,
- next_stripe * dc->disk.stripe_size),
- dirty_pred);
-
- if (array_freelist_empty(&buf->freelist))
- return;
-
- stripe = next_stripe;
-next:
- if (wrapped && stripe > start_stripe)
- return;
-
- if (stripe == dc->disk.nr_stripes) {
- stripe = 0;
- wrapped = true;
- }
- }
-}
-
-static u64 bch_writeback(struct cached_dev *dc)
-{
- struct keybuf *buf = &dc->writeback_keys;
- unsigned inode = bcache_dev_inum(&dc->disk);
- struct bpos start = POS(inode, 0);
- struct bpos end = POS(inode, KEY_OFFSET_MAX);
- struct bpos start_pos;
- u64 sectors_written = 0;
-
- buf->last_scanned = POS(inode, 0);
-
- while (bkey_cmp(buf->last_scanned, end) < 0 &&
- !kthread_should_stop()) {
- down_write(&dc->writeback_lock);
-
- if (!atomic_read(&dc->has_dirty)) {
- up_write(&dc->writeback_lock);
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (kthread_should_stop())
- return sectors_written;
-
- schedule();
- try_to_freeze();
- return sectors_written;
- }
-
- if (bkey_cmp(buf->last_scanned, end) >= 0)
- buf->last_scanned = POS(inode, 0);
-
- if (dc->partial_stripes_expensive) {
- refill_full_stripes(dc);
- if (array_freelist_empty(&buf->freelist))
- goto refill_done;
- }
-
- start_pos = buf->last_scanned;
- bch_refill_keybuf(dc->disk.c, buf, end, dirty_pred);
-
- if (bkey_cmp(buf->last_scanned, end) >= 0) {
- /*
-			 * If we get to the end, start scanning again from the
- * beginning, and only scan up to where we initially
- * started scanning from:
- */
- buf->last_scanned = start;
- bch_refill_keybuf(dc->disk.c, buf, start_pos,
- dirty_pred);
- }
-
- if (RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
- atomic_set(&dc->has_dirty, 0);
- cached_dev_put(dc);
- SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
- bch_write_bdev_super(dc, NULL);
- }
-
-refill_done:
- up_write(&dc->writeback_lock);
-
- bch_ratelimit_reset(&dc->writeback_pd.rate);
- sectors_written += read_dirty(dc);
- }
-
- return sectors_written;
-}
-
-static int bch_writeback_thread(void *arg)
-{
- struct cached_dev *dc = arg;
- struct bch_fs *c = dc->disk.c;
- struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last;
- u64 sectors_written;
-
- set_freezable();
-
- while (!kthread_should_stop()) {
- if (kthread_wait_freezable(dc->writeback_running ||
- test_bit(BCACHE_DEV_DETACHING,
- &dc->disk.flags)))
- break;
-
- last = atomic_long_read(&clock->now);
-
- sectors_written = bch_writeback(dc);
-
- if (sectors_written < c->capacity >> 4)
- bch_kthread_io_clock_wait(clock,
- last + (c->capacity >> 5));
- }
-
- return 0;
-}
-
-/**
- * bch_writeback_recalc_oldest_gens - update oldest_gen pointers from writeback keys
- *
- * This prevents us from wrapping around gens for a bucket only referenced from
- * writeback keybufs. We don't actually care that the data in those buckets is
- * marked live, only that we don't wrap the gens.
- */
-void bch_writeback_recalc_oldest_gens(struct bch_fs *c)
-{
- struct radix_tree_iter iter;
- void **slot;
-
- rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
- struct bcache_device *d;
- struct cached_dev *dc;
-
- d = radix_tree_deref_slot(slot);
-
- if (!CACHED_DEV(&d->inode.v))
- continue;
- dc = container_of(d, struct cached_dev, disk);
-
- bch_keybuf_recalc_oldest_gens(c, &dc->writeback_keys);
- }
-
- rcu_read_unlock();
-}
-
-/* Init */
-
-void bch_sectors_dirty_init(struct cached_dev *dc, struct bch_fs *c)
-{
- struct bcache_device *d = &dc->disk;
- struct btree_iter iter;
- struct bkey_s_c k;
-
- /*
- * We have to do this before the disk is added to the radix tree or we
- * race with moving GC
- */
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
- POS(bcache_dev_inum(d), 0), k) {
- if (k.k->p.inode > bcache_dev_inum(d))
- break;
-
- if (bkey_extent_is_data(k.k) &&
- !bkey_extent_is_cached(k.k))
- __bcache_dev_sectors_dirty_add(d,
- bkey_start_offset(k.k),
- k.k->size);
-
- bch_btree_iter_cond_resched(&iter);
- }
- bch_btree_iter_unlock(&iter);
-
- dc->writeback_pd.last_actual = bcache_dev_sectors_dirty(d);
-}
-
-void bch_cached_dev_writeback_stop(struct cached_dev *dc)
-{
- cancel_delayed_work_sync(&dc->writeback_pd_update);
- if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
- kthread_stop(dc->writeback_thread);
- dc->writeback_thread = NULL;
- }
-}
-
-void bch_cached_dev_writeback_free(struct cached_dev *dc)
-{
- struct bcache_device *d = &dc->disk;
-
- mempool_exit(&dc->writeback_page_pool);
- mempool_exit(&dc->writeback_io_pool);
- kvfree(d->full_dirty_stripes);
- kvfree(d->stripe_sectors_dirty);
-}
-
-int bch_cached_dev_writeback_init(struct cached_dev *dc)
-{
- struct bcache_device *d = &dc->disk;
- sector_t sectors;
- size_t n;
-
- sectors = get_capacity(dc->disk.disk);
-
- if (!d->stripe_size) {
-#ifdef CONFIG_BCACHE_DEBUG
- d->stripe_size = 1 << 0;
-#else
- d->stripe_size = 1 << 31;
-#endif
- }
-
- pr_debug("stripe size: %d sectors", d->stripe_size);
- d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
-
- if (!d->nr_stripes ||
- d->nr_stripes > INT_MAX ||
- d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
- pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
- (unsigned)d->nr_stripes);
- return -ENOMEM;
- }
-
- n = d->nr_stripes * sizeof(atomic_t);
- d->stripe_sectors_dirty = n < PAGE_SIZE << 6
- ? kzalloc(n, GFP_KERNEL)
- : vzalloc(n);
- if (!d->stripe_sectors_dirty) {
- pr_err("cannot allocate stripe_sectors_dirty");
- return -ENOMEM;
- }
-
- n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
- d->full_dirty_stripes = n < PAGE_SIZE << 6
- ? kzalloc(n, GFP_KERNEL)
- : vzalloc(n);
- if (!d->full_dirty_stripes) {
- pr_err("cannot allocate full_dirty_stripes");
- return -ENOMEM;
- }
-
- if (mempool_init_kmalloc_pool(&dc->writeback_io_pool, 4,
- sizeof(struct dirty_io) +
- sizeof(struct bio_vec) *
- DIRTY_IO_MEMPOOL_BVECS) ||
- mempool_init_page_pool(&dc->writeback_page_pool,
- (64 << 10) / PAGE_SIZE, 0))
- return -ENOMEM;
-
- init_rwsem(&dc->writeback_lock);
- bch_keybuf_init(&dc->writeback_keys);
-
- dc->writeback_metadata = true;
- dc->writeback_running = true;
- dc->writeback_percent = 10;
- dc->writeback_pd_update_seconds = 5;
-
- bch_pd_controller_init(&dc->writeback_pd);
- INIT_DELAYED_WORK(&dc->writeback_pd_update, update_writeback_rate);
-
- return 0;
-}
-
-int bch_cached_dev_writeback_start(struct cached_dev *dc)
-{
- dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
- "bcache_writeback");
- if (IS_ERR(dc->writeback_thread))
- return PTR_ERR(dc->writeback_thread);
-
- schedule_delayed_work(&dc->writeback_pd_update,
- dc->writeback_pd_update_seconds * HZ);
-
- bch_writeback_queue(dc);
-
- return 0;
-}
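
For reference, __update_writeback_rate() above computes each backing device's dirty target as its share of the cache-set-wide target: capacity * writeback_percent / 100, apportioned by the device's fraction of cached_dev_sectors, then converted to bytes for the PD controller. A small user-space sketch of that arithmetic, with invented numbers:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	/* All values below are made up for illustration (units: 512-byte sectors). */
	uint64_t capacity          = 1000000;	/* cache set size */
	uint64_t writeback_percent = 10;
	uint64_t bdev_sectors      = 400000;	/* this backing device */
	uint64_t cached_dev_sectors = 800000;	/* all backing devices combined */

	/* Cache-set-wide dirty target... */
	uint64_t cache_dirty_target = capacity * writeback_percent / 100;

	/* ...apportioned to this device by its share of the total, then
	 * converted to bytes (<< 9) as in __update_writeback_rate(). */
	uint64_t target_sectors = cache_dirty_target * bdev_sectors / cached_dev_sectors;

	printf("per-device dirty target: %" PRIu64 " sectors (%" PRIu64 " bytes)\n",
	       target_sectors, target_sectors << 9);
	return 0;
}
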
diff --git a/libbcache/writeback.h b/libbcache/writeback.h
deleted file mode 100644
index 82ce306e..00000000
--- a/libbcache/writeback.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef _BCACHE_WRITEBACK_H
-#define _BCACHE_WRITEBACK_H
-
-#include "blockdev.h"
-#include "buckets.h"
-
-#define CUTOFF_WRITEBACK 60
-#define CUTOFF_WRITEBACK_SYNC 30
-
-static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
-{
- uint64_t i, ret = 0;
-
- for (i = 0; i < d->nr_stripes; i++)
- ret += atomic_read(d->stripe_sectors_dirty + i);
-
- return ret;
-}
-
-static inline unsigned offset_to_stripe(struct bcache_device *d,
- uint64_t offset)
-{
- do_div(offset, d->stripe_size);
- return offset;
-}
-
-static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
- uint64_t offset,
- unsigned nr_sectors)
-{
- unsigned stripe = offset_to_stripe(&dc->disk, offset);
-
- while (1) {
- if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
- return true;
-
- if (nr_sectors <= dc->disk.stripe_size)
- return false;
-
- nr_sectors -= dc->disk.stripe_size;
- stripe++;
- }
-}
-
-static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
- unsigned cache_mode, bool would_skip)
-{
- struct bch_fs *c = dc->disk.c;
- u64 available = sectors_available(c);
-
- if (cache_mode != CACHE_MODE_WRITEBACK ||
- test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- available * 100 < c->capacity * CUTOFF_WRITEBACK_SYNC)
- return false;
-
- if (dc->partial_stripes_expensive &&
- bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
- bio_sectors(bio)))
- return true;
-
- if (would_skip)
- return false;
-
- return bio->bi_opf & REQ_SYNC ||
- available * 100 < c->capacity * CUTOFF_WRITEBACK;
-}
-
-static inline void bch_writeback_queue(struct cached_dev *dc)
-{
- if (!IS_ERR_OR_NULL(dc->writeback_thread))
- wake_up_process(dc->writeback_thread);
-}
-
-static inline void bch_writeback_add(struct cached_dev *dc)
-{
- if (!atomic_read(&dc->has_dirty) &&
- !atomic_xchg(&dc->has_dirty, 1)) {
- atomic_inc(&dc->count);
-
- if (BDEV_STATE(dc->disk_sb.sb) != BDEV_STATE_DIRTY) {
- SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_DIRTY);
- /* XXX: should do this synchronously */
- bch_write_bdev_super(dc, NULL);
- }
-
- bch_writeback_queue(dc);
- }
-}
-
-#ifndef NO_BCACHE_WRITEBACK
-
-void bcache_dev_sectors_dirty_add(struct bch_fs *, unsigned, u64, int);
-
-void bch_writeback_recalc_oldest_gens(struct bch_fs *);
-void bch_sectors_dirty_init(struct cached_dev *, struct bch_fs *c);
-
-void bch_cached_dev_writeback_stop(struct cached_dev *);
-void bch_cached_dev_writeback_free(struct cached_dev *);
-int bch_cached_dev_writeback_init(struct cached_dev *);
-int bch_cached_dev_writeback_start(struct cached_dev *);
-
-#else
-
-static inline void bcache_dev_sectors_dirty_add(struct bch_fs *c,
- unsigned i, u64 o, int n) {}
-static inline void bch_writeback_recalc_oldest_gens(struct bch_fs *c) {}
-static inline void bch_sectors_dirty_init(struct cached_dev *dc,
- struct bch_fs *c) {}
-static inline void bch_cached_dev_writeback_stop(struct cached_dev *dc) {}
-static inline void bch_cached_dev_writeback_free(struct cached_dev *dc) {}
-static inline int bch_cached_dev_writeback_init(struct cached_dev *dc)
-{
- return 0;
-}
-static inline int bch_cached_dev_writeback_start(struct cached_dev *dc)
-{
- return 0;
-}
-
-#endif
-
-#endif
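
offset_to_stripe() above maps a backing-device sector offset to the fixed-size stripe it falls in, and bcache_dev_stripe_dirty() walks every stripe a request touches. A small sketch of that mapping; the stripe size and offsets here are invented:

#include <inttypes.h>
#include <stdio.h>

#define EXAMPLE_STRIPE_SIZE	2048	/* sectors per stripe (assumption) */

/* Same idea as offset_to_stripe(): integer division by the stripe size. */
static unsigned example_offset_to_stripe(uint64_t offset)
{
	return offset / EXAMPLE_STRIPE_SIZE;
}

int main(void)
{
	/* A request starting at sector 3000 and covering 3000 sectors spans
	 * stripes 1 and 2; bcache_dev_stripe_dirty() would check both. */
	uint64_t start = 3000, nr_sectors = 3000;
	unsigned first = example_offset_to_stripe(start);
	unsigned last  = example_offset_to_stripe(start + nr_sectors - 1);

	printf("sectors %" PRIu64 "..%" PRIu64 " touch stripes %u..%u\n",
	       start, start + nr_sectors - 1, first, last);
	return 0;
}
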
diff --git a/libbcache/xattr.c b/libbcache/xattr.c
deleted file mode 100644
index a5c66fa1..00000000
--- a/libbcache/xattr.c
+++ /dev/null
@@ -1,365 +0,0 @@
-
-#include "bcache.h"
-#include "bkey_methods.h"
-#include "btree_update.h"
-#include "extents.h"
-#include "fs.h"
-#include "str_hash.h"
-#include "xattr.h"
-
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-struct xattr_search_key {
- u8 type;
- struct qstr name;
-};
-
-#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
- { .type = _type, .name = QSTR_INIT(_name, _len) })
-
-static u64 bch_xattr_hash(const struct bch_hash_info *info,
- const struct xattr_search_key *key)
-{
- struct bch_str_hash_ctx ctx;
-
- bch_str_hash_init(&ctx, info);
- bch_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
- bch_str_hash_update(&ctx, info, key->name.name, key->name.len);
-
- return bch_str_hash_end(&ctx, info);
-}
-
-#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
-
-static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
-{
- return bch_xattr_hash(info, key);
-}
-
-static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
-{
- struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
-
- return bch_xattr_hash(info,
- &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
-}
-
-static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- const struct xattr_search_key *r = _r;
-
- return l.v->x_type != r->type ||
- l.v->x_name_len != r->name.len ||
- memcmp(l.v->x_name, r->name.name, r->name.len);
-}
-
-static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
-{
- struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
- struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
-
- return l.v->x_type != r.v->x_type ||
- l.v->x_name_len != r.v->x_name_len ||
- memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
-}
-
-static const struct bch_hash_desc xattr_hash_desc = {
- .btree_id = BTREE_ID_XATTRS,
- .key_type = BCH_XATTR,
- .whiteout_type = BCH_XATTR_WHITEOUT,
- .hash_key = xattr_hash_key,
- .hash_bkey = xattr_hash_bkey,
- .cmp_key = xattr_cmp_key,
- .cmp_bkey = xattr_cmp_bkey,
-};
-
-static const char *bch_xattr_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
-{
- switch (k.k->type) {
- case BCH_XATTR:
- return bkey_val_bytes(k.k) < sizeof(struct bch_xattr)
- ? "value too small"
- : NULL;
-
- case BCH_XATTR_WHITEOUT:
- return bkey_val_bytes(k.k) != 0
- ? "value size should be zero"
- : NULL;
-
- default:
- return "invalid type";
- }
-}
-
-static void bch_xattr_to_text(struct bch_fs *c, char *buf,
- size_t size, struct bkey_s_c k)
-{
- struct bkey_s_c_xattr xattr;
- int n;
-
- switch (k.k->type) {
- case BCH_XATTR:
- xattr = bkey_s_c_to_xattr(k);
-
- if (size) {
- n = min_t(unsigned, size, xattr.v->x_name_len);
- memcpy(buf, xattr.v->x_name, n);
- buf[size - 1] = '\0';
- buf += n;
- size -= n;
- }
-
- n = scnprintf(buf, size, " -> ");
- buf += n;
- size -= n;
-
- if (size) {
- n = min_t(unsigned, size,
- le16_to_cpu(xattr.v->x_val_len));
- memcpy(buf, xattr_val(xattr.v), n);
- buf[size - 1] = '\0';
- buf += n;
- size -= n;
- }
-
- break;
- case BCH_XATTR_WHITEOUT:
- scnprintf(buf, size, "whiteout");
- break;
- }
-}
-
-const struct bkey_ops bch_bkey_xattr_ops = {
- .key_invalid = bch_xattr_invalid,
- .val_to_text = bch_xattr_to_text,
-};
-
-int bch_xattr_get(struct bch_fs *c, struct inode *inode,
- const char *name, void *buffer, size_t size, int type)
-{
- struct bch_inode_info *ei = to_bch_ei(inode);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_xattr xattr;
- int ret;
-
- k = bch_hash_lookup(xattr_hash_desc, &ei->str_hash, c,
- ei->vfs_inode.i_ino, &iter,
- &X_SEARCH(type, name, strlen(name)));
- if (IS_ERR(k.k))
- return bch_btree_iter_unlock(&iter) ?: -ENODATA;
-
- xattr = bkey_s_c_to_xattr(k);
- ret = le16_to_cpu(xattr.v->x_val_len);
- if (buffer) {
- if (ret > size)
- ret = -ERANGE;
- else
- memcpy(buffer, xattr_val(xattr.v), ret);
- }
-
- bch_btree_iter_unlock(&iter);
- return ret;
-}
-
-int __bch_xattr_set(struct bch_fs *c, u64 inum,
- const struct bch_hash_info *hash_info,
- const char *name, const void *value, size_t size,
- int flags, int type, u64 *journal_seq)
-{
- struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
- int ret;
-
- if (!value) {
- ret = bch_hash_delete(xattr_hash_desc, hash_info,
- c, inum,
- journal_seq, &search);
- } else {
- struct bkey_i_xattr *xattr;
- unsigned u64s = BKEY_U64s +
- DIV_ROUND_UP(sizeof(struct bch_xattr) +
- search.name.len + size,
- sizeof(u64));
-
- if (u64s > U8_MAX)
- return -ERANGE;
-
- xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS);
- if (!xattr)
- return -ENOMEM;
-
- bkey_xattr_init(&xattr->k_i);
- xattr->k.u64s = u64s;
- xattr->v.x_type = type;
- xattr->v.x_name_len = search.name.len;
- xattr->v.x_val_len = cpu_to_le16(size);
- memcpy(xattr->v.x_name, search.name.name, search.name.len);
- memcpy(xattr_val(&xattr->v), value, size);
-
- ret = bch_hash_set(xattr_hash_desc, hash_info, c,
- inum, journal_seq,
- &xattr->k_i,
- (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
- (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
- kfree(xattr);
- }
-
- if (ret == -ENOENT)
- ret = flags & XATTR_REPLACE ? -ENODATA : 0;
-
- return ret;
-}
-
-int bch_xattr_set(struct bch_fs *c, struct inode *inode,
- const char *name, const void *value, size_t size,
- int flags, int type)
-{
- struct bch_inode_info *ei = to_bch_ei(inode);
-
- return __bch_xattr_set(c, inode->i_ino, &ei->str_hash,
- name, value, size, flags, type,
- &ei->journal_seq);
-}
-
-static const struct xattr_handler *bch_xattr_type_to_handler(unsigned);
-
-static size_t bch_xattr_emit(struct dentry *dentry,
- const struct bch_xattr *xattr,
- char *buffer, size_t buffer_size)
-{
- const struct xattr_handler *handler =
- bch_xattr_type_to_handler(xattr->x_type);
-
- if (handler && (!handler->list || handler->list(dentry))) {
- const char *prefix = handler->prefix ?: handler->name;
- const size_t prefix_len = strlen(prefix);
- const size_t total_len = prefix_len + xattr->x_name_len + 1;
-
- if (buffer && total_len <= buffer_size) {
- memcpy(buffer, prefix, prefix_len);
- memcpy(buffer + prefix_len,
- xattr->x_name, xattr->x_name_len);
- buffer[prefix_len + xattr->x_name_len] = '\0';
- }
-
- return total_len;
- } else {
- return 0;
- }
-}
-
-ssize_t bch_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- struct bch_fs *c = dentry->d_sb->s_fs_info;
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_xattr *xattr;
- u64 inum = dentry->d_inode->i_ino;
- ssize_t ret = 0;
- size_t len;
-
- for_each_btree_key(&iter, c, BTREE_ID_XATTRS, POS(inum, 0), k) {
- BUG_ON(k.k->p.inode < inum);
-
- if (k.k->p.inode > inum)
- break;
-
- if (k.k->type != BCH_XATTR)
- continue;
-
- xattr = bkey_s_c_to_xattr(k).v;
-
- len = bch_xattr_emit(dentry, xattr, buffer, buffer_size);
- if (buffer) {
- if (len > buffer_size) {
- bch_btree_iter_unlock(&iter);
- return -ERANGE;
- }
-
- buffer += len;
- buffer_size -= len;
- }
-
- ret += len;
-
- }
- bch_btree_iter_unlock(&iter);
-
- return ret;
-}
-
-static int bch_xattr_get_handler(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- struct bch_fs *c = inode->i_sb->s_fs_info;
-
- return bch_xattr_get(c, inode, name, buffer, size, handler->flags);
-}
-
-static int bch_xattr_set_handler(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct bch_fs *c = inode->i_sb->s_fs_info;
-
- return bch_xattr_set(c, inode, name, value, size, flags,
- handler->flags);
-}
-
-static const struct xattr_handler bch_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = bch_xattr_get_handler,
- .set = bch_xattr_set_handler,
- .flags = BCH_XATTR_INDEX_USER,
-};
-
-static bool bch_xattr_trusted_list(struct dentry *dentry)
-{
- return capable(CAP_SYS_ADMIN);
-}
-
-static const struct xattr_handler bch_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .list = bch_xattr_trusted_list,
- .get = bch_xattr_get_handler,
- .set = bch_xattr_set_handler,
- .flags = BCH_XATTR_INDEX_TRUSTED,
-};
-
-static const struct xattr_handler bch_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = bch_xattr_get_handler,
- .set = bch_xattr_set_handler,
- .flags = BCH_XATTR_INDEX_SECURITY,
-};
-
-static const struct xattr_handler *bch_xattr_handler_map[] = {
- [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &posix_acl_access_xattr_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &posix_acl_default_xattr_handler,
- [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
-};
-
-const struct xattr_handler *bch_xattr_handlers[] = {
- &bch_xattr_user_handler,
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
- &bch_xattr_trusted_handler,
- &bch_xattr_security_handler,
- NULL
-};
-
-static const struct xattr_handler *bch_xattr_type_to_handler(unsigned type)
-{
- return type < ARRAY_SIZE(bch_xattr_handler_map)
- ? bch_xattr_handler_map[type]
- : NULL;
-}
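
__bch_xattr_set() above packs the attribute name and value after a fixed bch_xattr header, rounds the total up to whole u64s, and rejects anything whose key would not fit in the 8-bit u64s field. A hedged sketch of that size check; BKEY_U64S_EXAMPLE and XATTR_HDR_BYTES are invented stand-ins for the real BKEY_U64s and sizeof(struct bch_xattr), whose values are not given in this patch:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define BKEY_U64S_EXAMPLE	5	/* assumption, not from the patch */
#define XATTR_HDR_BYTES		8	/* assumption, not from the patch */

/* Mirrors the u64s calculation and U8_MAX check in __bch_xattr_set(). */
static bool xattr_fits(size_t name_len, size_t val_len)
{
	size_t u64s = BKEY_U64S_EXAMPLE +
		DIV_ROUND_UP(XATTR_HDR_BYTES + name_len + val_len,
			     sizeof(uint64_t));

	return u64s <= UINT8_MAX;
}

int main(void)
{
	printf("12-byte name, 100-byte value fits:  %d\n", xattr_fits(12, 100));
	printf("12-byte name, 4096-byte value fits: %d\n", xattr_fits(12, 4096));
	return 0;
}
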
diff --git a/libbcache/xattr.h b/libbcache/xattr.h
deleted file mode 100644
index c48c7acf..00000000
--- a/libbcache/xattr.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _BCACHE_XATTR_H
-#define _BCACHE_XATTR_H
-
-extern const struct bkey_ops bch_bkey_xattr_ops;
-
-struct dentry;
-struct xattr_handler;
-struct bch_hash_info;
-
-int bch_xattr_get(struct bch_fs *, struct inode *,
- const char *, void *, size_t, int);
-int __bch_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *,
- const char *, const void *, size_t, int, int, u64 *);
-int bch_xattr_set(struct bch_fs *, struct inode *,
- const char *, const void *, size_t, int, int);
-ssize_t bch_xattr_list(struct dentry *, char *, size_t);
-
-extern const struct xattr_handler *bch_xattr_handlers[];
-
-#endif /* _BCACHE_XATTR_H */