Diffstat (limited to 'libbcache')
-rw-r--r--  libbcache/acl.c            |   2
-rw-r--r--  libbcache/alloc.c          |  93
-rw-r--r--  libbcache/alloc_types.h    |   2
-rw-r--r--  libbcache/bcache.h         |  46
-rw-r--r--  libbcache/bkey.c           | 172
-rw-r--r--  libbcache/bkey.h           |  28
-rw-r--r--  libbcache/blockdev.c       |  41
-rw-r--r--  libbcache/bset.c           |   4
-rw-r--r--  libbcache/bset.h           |  17
-rw-r--r--  libbcache/btree_cache.c    |   2
-rw-r--r--  libbcache/btree_gc.c       |  71
-rw-r--r--  libbcache/btree_gc.h       |   2
-rw-r--r--  libbcache/btree_io.c       | 201
-rw-r--r--  libbcache/btree_types.h    |  16
-rw-r--r--  libbcache/btree_update.c   |  31
-rw-r--r--  libbcache/btree_update.h   |  24
-rw-r--r--  libbcache/buckets.c        |  13
-rw-r--r--  libbcache/buckets.h        |   2
-rw-r--r--  libbcache/chardev.c        |  37
-rw-r--r--  libbcache/checksum.c       | 450
-rw-r--r--  libbcache/checksum.h       | 129
-rw-r--r--  libbcache/compress.c       | 144
-rw-r--r--  libbcache/compress.h       |   5
-rw-r--r--  libbcache/debug.c          |  12
-rw-r--r--  libbcache/dirent.c         |  31
-rw-r--r--  libbcache/extents.c        | 443
-rw-r--r--  libbcache/extents.h        | 211
-rw-r--r--  libbcache/fs-gc.c          | 163
-rw-r--r--  libbcache/fs-io.c          | 159
-rw-r--r--  libbcache/fs.c             | 156
-rw-r--r--  libbcache/fs.h             |   9
-rw-r--r--  libbcache/inode.c          | 288
-rw-r--r--  libbcache/inode.h          |  43
-rw-r--r--  libbcache/io.c             | 116
-rw-r--r--  libbcache/io.h             |   2
-rw-r--r--  libbcache/io_types.h       |   9
-rw-r--r--  libbcache/journal.c        | 583
-rw-r--r--  libbcache/journal.h        |  29
-rw-r--r--  libbcache/journal_types.h  |   6
-rw-r--r--  libbcache/migrate.c        |  16
-rw-r--r--  libbcache/move.c           |  27
-rw-r--r--  libbcache/movinggc.c       |   2
-rw-r--r--  libbcache/notify.c         |   4
-rw-r--r--  libbcache/opts.c           |  19
-rw-r--r--  libbcache/opts.h           |  88
-rw-r--r--  libbcache/siphash.c        |  99
-rw-r--r--  libbcache/str_hash.h       |  84
-rw-r--r--  libbcache/super-io.c       | 798
-rw-r--r--  libbcache/super-io.h       | 141
-rw-r--r--  libbcache/super.c          | 945
-rw-r--r--  libbcache/super.h          |  48
-rw-r--r--  libbcache/super_types.h    |   2
-rw-r--r--  libbcache/sysfs.c          | 110
-rw-r--r--  libbcache/tier.c           |   3
-rw-r--r--  libbcache/vstructs.h       |  62
-rw-r--r--  libbcache/xattr.c          |  35
56 files changed, 3900 insertions(+), 2375 deletions(-)
diff --git a/libbcache/acl.c b/libbcache/acl.c
index 64d5616..468d98d 100644
--- a/libbcache/acl.c
+++ b/libbcache/acl.c
@@ -187,7 +187,7 @@ int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (ret < 0)
return ret;
else {
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_fs_time(inode->i_sb);
mark_inode_dirty(inode);
if (ret == 0)
acl = NULL;
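The change above replaces the second-granularity CURRENT_TIME_SEC constant with current_fs_time(), which returns the current time truncated to the filesystem's declared timestamp granularity. A minimal sketch of the same pattern for any inode timestamp update; the helper is illustrative and not part of this commit:

/* stamp both mtime and ctime with fs-granular time, then dirty the inode */
static void example_touch_timestamps(struct inode *inode)
{
	struct timespec now = current_fs_time(inode->i_sb);

	inode->i_mtime = now;
	inode->i_ctime = now;
	mark_inode_dirty(inode);
}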
diff --git a/libbcache/alloc.c b/libbcache/alloc.c
index 4fe08b5..cd22c38 100644
--- a/libbcache/alloc.c
+++ b/libbcache/alloc.c
@@ -64,7 +64,7 @@
#include "extents.h"
#include "io.h"
#include "journal.h"
-#include "super.h"
+#include "super-io.h"
#include <linux/blkdev.h>
#include <linux/kthread.h>
@@ -105,7 +105,7 @@ void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
if (rcu_access_pointer(grp->d[i].dev) == ca)
goto out;
- BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
+ BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX);
rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
out:
@@ -124,9 +124,9 @@ static void pd_controllers_update(struct work_struct *work)
int i;
/* All units are in bytes */
- u64 tier_size[CACHE_TIERS];
- u64 tier_free[CACHE_TIERS];
- u64 tier_dirty[CACHE_TIERS];
+ u64 tier_size[BCH_TIER_MAX];
+ u64 tier_free[BCH_TIER_MAX];
+ u64 tier_dirty[BCH_TIER_MAX];
u64 tier0_can_free = 0;
memset(tier_size, 0, sizeof(tier_size));
@@ -134,7 +134,7 @@ static void pd_controllers_update(struct work_struct *work)
memset(tier_dirty, 0, sizeof(tier_dirty));
rcu_read_lock();
- for (i = CACHE_TIERS - 1; i >= 0; --i)
+ for (i = BCH_TIER_MAX - 1; i >= 0; --i)
group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
unsigned bucket_bits = ca->bucket_bits + 9;
@@ -246,6 +246,16 @@ static int prio_io(struct cache *ca, uint64_t bucket, int op)
return submit_bio_wait(ca->bio_prio);
}
+static struct nonce prio_nonce(struct prio_set *p)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = p->nonce[0],
+ [2] = p->nonce[1],
+ [3] = p->nonce[2]^BCH_NONCE_PRIO,
+ }};
+}
+
static int bch_prio_write(struct cache *ca)
{
struct cache_set *c = ca->set;
@@ -279,12 +289,8 @@ static int bch_prio_write(struct cache *ca)
}
p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]);
- p->magic = cpu_to_le64(pset_magic(&c->disk_sb));
-
- SET_PSET_CSUM_TYPE(p, c->opts.metadata_checksum);
- p->csum = cpu_to_le64(bch_checksum(PSET_CSUM_TYPE(p),
- &p->magic,
- bucket_bytes(ca) - 8));
+ p->magic = cpu_to_le64(pset_magic(c));
+ get_random_bytes(&p->nonce, sizeof(p->nonce));
spin_lock(&ca->prio_buckets_lock);
r = bch_bucket_alloc(ca, RESERVE_PRIO);
@@ -298,6 +304,19 @@ static int bch_prio_write(struct cache *ca)
bch_mark_metadata_bucket(ca, ca->buckets + r, false);
spin_unlock(&ca->prio_buckets_lock);
+ SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
+
+ bch_encrypt(c, PSET_CSUM_TYPE(p),
+ prio_nonce(p),
+ p->encrypted_start,
+ bucket_bytes(ca) -
+ offsetof(struct prio_set, encrypted_start));
+
+ p->csum = bch_checksum(c, PSET_CSUM_TYPE(p),
+ prio_nonce(p),
+ (void *) p + sizeof(p->csum),
+ bucket_bytes(ca) - sizeof(p->csum));
+
ret = prio_io(ca, r, REQ_OP_WRITE);
if (cache_fatal_io_err_on(ret, ca,
"prio write to bucket %zu", r) ||
@@ -306,9 +325,9 @@ static int bch_prio_write(struct cache *ca)
}
spin_lock(&j->lock);
- j->prio_buckets[ca->sb.nr_this_dev] = cpu_to_le64(ca->prio_buckets[0]);
+ j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]);
j->nr_prio_buckets = max_t(unsigned,
- ca->sb.nr_this_dev + 1,
+ ca->dev_idx + 1,
j->nr_prio_buckets);
spin_unlock(&j->lock);
@@ -320,7 +339,7 @@ static int bch_prio_write(struct cache *ca)
return ret;
need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
- ca->sb.nr_this_dev + 1;
+ ca->dev_idx + 1;
bch_journal_res_put(j, &res);
ret = bch_journal_flush_seq(j, res.seq);
@@ -355,13 +374,14 @@ int bch_prio_read(struct cache *ca)
struct prio_set *p = ca->disk_buckets;
struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
struct bucket_mark new;
+ struct bch_csum csum;
unsigned bucket_nr = 0;
u64 bucket, expect, got;
size_t b;
int ret = 0;
spin_lock(&c->journal.lock);
- bucket = le64_to_cpu(c->journal.prio_buckets[ca->sb.nr_this_dev]);
+ bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]);
spin_unlock(&c->journal.lock);
/*
@@ -387,18 +407,28 @@ int bch_prio_read(struct cache *ca)
return -EIO;
got = le64_to_cpu(p->magic);
- expect = pset_magic(&c->disk_sb);
+ expect = pset_magic(c);
unfixable_fsck_err_on(got != expect, c,
"bad magic (got %llu expect %llu) while reading prios from bucket %llu",
got, expect, bucket);
- got = le64_to_cpu(p->csum);
- expect = bch_checksum(PSET_CSUM_TYPE(p),
- &p->magic,
- bucket_bytes(ca) - 8);
- unfixable_fsck_err_on(got != expect, c,
- "bad checksum (got %llu expect %llu) while reading prios from bucket %llu",
- got, expect, bucket);
+ unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c,
+ "prio bucket with unknown csum type %llu bucket %lluu",
+ PSET_CSUM_TYPE(p), bucket);
+
+ csum = bch_checksum(c, PSET_CSUM_TYPE(p),
+ prio_nonce(p),
+ (void *) p + sizeof(p->csum),
+ bucket_bytes(ca) - sizeof(p->csum));
+ unfixable_fsck_err_on(bch_crc_cmp(csum, p->csum), c,
+ "bad checksum reading prios from bucket %llu",
+ bucket);
+
+ bch_encrypt(c, PSET_CSUM_TYPE(p),
+ prio_nonce(p),
+ p->encrypted_start,
+ bucket_bytes(ca) -
+ offsetof(struct prio_set, encrypted_start));
bucket = le64_to_cpu(p->next_bucket);
d = p->data;
@@ -1029,7 +1059,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
spin_lock(&devs->lock);
for (i = 0; i < devs->nr_devices; i++)
- available += !test_bit(devs->d[i].dev->sb.nr_this_dev,
+ available += !test_bit(devs->d[i].dev->dev_idx,
caches_used);
recalc_alloc_group_weights(c, devs);
@@ -1054,7 +1084,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
ca = devs->d[i].dev;
- if (test_bit(ca->sb.nr_this_dev, caches_used))
+ if (test_bit(ca->dev_idx, caches_used))
continue;
if (fail_idx == -1 &&
@@ -1082,11 +1112,11 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
ob->ptrs[0] = (struct bch_extent_ptr) {
.gen = ca->buckets[bucket].mark.gen,
.offset = bucket_to_sector(ca, bucket),
- .dev = ca->sb.nr_this_dev,
+ .dev = ca->dev_idx,
};
ob->ptr_offset[0] = 0;
- __set_bit(ca->sb.nr_this_dev, caches_used);
+ __set_bit(ca->dev_idx, caches_used);
available--;
devs->cur_device = i;
}
@@ -1334,7 +1364,7 @@ static int open_bucket_add_buckets(struct cache_set *c,
enum alloc_reserve reserve,
struct closure *cl)
{
- long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)];
+ long caches_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
int i, dst;
/*
@@ -1475,6 +1505,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
tmp = ob->ptrs[i];
+ tmp.cached = bkey_extent_is_cached(&e->k);
tmp.offset += ob->ptr_offset[i];
extent_ptr_append(e, tmp);
@@ -1657,7 +1688,7 @@ static void bch_stop_write_point(struct cache *ca,
return;
for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
- if (ptr->dev == ca->sb.nr_this_dev)
+ if (ptr->dev == ca->dev_idx)
goto found;
mutex_unlock(&ob->lock);
@@ -1682,7 +1713,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca)
if (atomic_read(&ob->pin)) {
mutex_lock(&ob->lock);
for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
- if (ptr->dev == ca->sb.nr_this_dev) {
+ if (ptr->dev == ca->dev_idx) {
mutex_unlock(&ob->lock);
return true;
}
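The prio-bucket changes above give struct prio_set a random per-write nonce and an encrypted_start region: the payload is encrypted first, and the checksum is then computed over everything after the csum field, so bch_prio_read() can verify integrity before decrypting. A condensed restatement of the write-side ordering from bch_prio_write(); the helper name is hypothetical:

static void prio_write_seal(struct cache_set *c, struct cache *ca,
			    struct prio_set *p)
{
	/* nonce is built from the freshly generated p->nonce[] words */
	struct nonce nonce = prio_nonce(p);

	bch_encrypt(c, PSET_CSUM_TYPE(p), nonce,
		    p->encrypted_start,
		    bucket_bytes(ca) -
		    offsetof(struct prio_set, encrypted_start));

	/* csum covers the already-encrypted bytes following the csum field */
	p->csum = bch_checksum(c, PSET_CSUM_TYPE(p), nonce,
			       (void *) p + sizeof(p->csum),
			       bucket_bytes(ca) - sizeof(p->csum));
}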
diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h
index 337b6e4..fbe8b75 100644
--- a/libbcache/alloc_types.h
+++ b/libbcache/alloc_types.h
@@ -56,7 +56,7 @@ struct cache_group {
struct {
u64 weight;
struct cache *dev;
- } d[MAX_CACHES_PER_SET];
+ } d[BCH_SB_MEMBERS_MAX];
};
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
diff --git a/libbcache/bcache.h b/libbcache/bcache.h
index 309d372..8a0262f 100644
--- a/libbcache/bcache.h
+++ b/libbcache/bcache.h
@@ -314,6 +314,8 @@ do { \
struct btree;
struct cache;
+struct crypto_blkcipher;
+struct crypto_ahash;
enum gc_phase {
GC_PHASE_PENDING_DELETE = BTREE_ID_NR + 1,
@@ -332,7 +334,6 @@ struct cache_member_cpu {
u16 bucket_size; /* sectors */
u8 state;
u8 tier;
- u8 replication_set;
u8 has_metadata;
u8 has_data;
u8 replacement;
@@ -342,7 +343,7 @@ struct cache_member_cpu {
struct cache_member_rcu {
struct rcu_head rcu;
- unsigned nr_in_set;
+ unsigned nr_devices;
struct cache_member_cpu m[];
};
@@ -363,14 +364,13 @@ struct cache {
struct cache_group self;
+ u8 dev_idx;
/*
* Cached version of this device's member info from superblock
- * Committed by write_super()
+ * Committed by bch_write_super() -> bch_cache_set_mi_update()
*/
- struct {
- u8 nr_this_dev;
- } sb;
struct cache_member_cpu mi;
+ uuid_le uuid;
struct bcache_superblock disk_sb;
@@ -518,36 +518,45 @@ struct cache_set {
struct percpu_ref writes;
struct work_struct read_only_work;
- struct cache __rcu *cache[MAX_CACHES_PER_SET];
-
- struct mutex mi_lock;
- struct cache_member_rcu __rcu *members;
- struct cache_member *disk_mi; /* protected by register_lock */
+ struct cache __rcu *cache[BCH_SB_MEMBERS_MAX];
struct cache_set_opts opts;
/*
* Cached copy in native endianness:
- * Set by cache_sb_to_cache_set:
+ * Set by bch_cache_set_mi_update():
*/
+ struct cache_member_rcu __rcu *members;
+
+ /* Updated by bch_sb_update():*/
struct {
+ uuid_le uuid;
+ uuid_le user_uuid;
+
u16 block_size;
u16 btree_node_size;
- u8 nr_in_set;
+ u8 nr_devices;
u8 clean;
u8 meta_replicas_have;
u8 data_replicas_have;
u8 str_hash_type;
+ u8 encryption_type;
+
+ u64 time_base_lo;
+ u32 time_base_hi;
+ u32 time_precision;
} sb;
- struct cache_sb disk_sb;
+ struct bch_sb *disk_sb;
+ unsigned disk_sb_order;
+
unsigned short block_bits; /* ilog2(block_size) */
struct closure sb_write;
- struct semaphore sb_write_mutex;
+ struct mutex sb_lock;
struct backing_dev_info bdi;
@@ -631,7 +640,7 @@ struct cache_set {
* allocate from:
*/
struct cache_group cache_all;
- struct cache_group cache_tiers[CACHE_TIERS];
+ struct cache_group cache_tiers[BCH_TIER_MAX];
u64 capacity; /* sectors */
@@ -724,6 +733,11 @@ struct cache_set {
struct bio_decompress_worker __percpu
*bio_decompress_worker;
+ struct crypto_blkcipher *chacha20;
+ struct crypto_shash *poly1305;
+
+ atomic64_t key_version;
+
/* For punting bio submissions to workqueue, io.c */
struct bio_list bio_submit_list;
struct work_struct bio_submit_work;
diff --git a/libbcache/bkey.c b/libbcache/bkey.c
index 64d2c84..374237e 100644
--- a/libbcache/bkey.c
+++ b/libbcache/bkey.c
@@ -81,9 +81,9 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
- p("u64s %u type %u %llu:%llu snap %u len %u ver %u",
+ p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
k->u64s, k->type, k->p.inode, k->p.offset,
- k->p.snapshot, k->size, k->version);
+ k->p.snapshot, k->size, k->version.lo);
BUG_ON(bkey_packed(k));
@@ -258,13 +258,21 @@ bool bch_bkey_transform(const struct bkey_format *out_f,
return true;
}
+#define bkey_fields() \
+ x(BKEY_FIELD_INODE, p.inode) \
+ x(BKEY_FIELD_OFFSET, p.offset) \
+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
+ x(BKEY_FIELD_SIZE, size) \
+ x(BKEY_FIELD_VERSION_HI, version.hi) \
+ x(BKEY_FIELD_VERSION_LO, version.lo)
+
struct bkey __bkey_unpack_key(const struct bkey_format *format,
const struct bkey_packed *in)
{
struct unpack_state state = unpack_state_init(format, in);
struct bkey out;
- EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
EBUG_ON(in->u64s < format->key_u64s);
EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
@@ -274,11 +282,10 @@ struct bkey __bkey_unpack_key(const struct bkey_format *format,
out.needs_whiteout = in->needs_whiteout;
out.type = in->type;
out.pad[0] = 0;
- out.p.inode = get_inc_field(&state, BKEY_FIELD_INODE);
- out.p.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
- out.p.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
- out.size = get_inc_field(&state, BKEY_FIELD_SIZE);
- out.version = get_inc_field(&state, BKEY_FIELD_VERSION);
+
+#define x(id, field) out.field = get_inc_field(&state, id);
+ bkey_fields()
+#undef x
return out;
}
@@ -290,7 +297,7 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format,
struct unpack_state state = unpack_state_init(format, in);
struct bpos out;
- EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
EBUG_ON(in->u64s < format->key_u64s);
EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
@@ -311,17 +318,14 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
struct pack_state state = pack_state_init(format, out);
EBUG_ON((void *) in == (void *) out);
- EBUG_ON(format->nr_fields != 5);
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
EBUG_ON(in->format != KEY_FORMAT_CURRENT);
out->_data[0] = 0;
- if (!set_inc_field(&state, BKEY_FIELD_INODE, in->p.inode) ||
- !set_inc_field(&state, BKEY_FIELD_OFFSET, in->p.offset) ||
- !set_inc_field(&state, BKEY_FIELD_SNAPSHOT, in->p.snapshot) ||
- !set_inc_field(&state, BKEY_FIELD_SIZE, in->size) ||
- !set_inc_field(&state, BKEY_FIELD_VERSION, in->version))
- return false;
+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
+ bkey_fields()
+#undef x
/*
* Extents - we have to guarantee that if an extent is packed, a trimmed
@@ -340,47 +344,6 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
return true;
}
-/*
- * Alternate implementations using bch_bkey_transform_key() - unfortunately, too
- * slow
- */
-#if 0
-struct bkey __bkey_unpack_key(const struct bkey_format *format,
- const struct bkey_packed *in)
-{
- struct bkey out;
- bool s;
-
- EBUG_ON(format->nr_fields != 5);
- EBUG_ON(in->u64s < format->key_u64s);
- EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
- s = bch_bkey_transform_key(&bch_bkey_format_current, (void *) &out,
- format, in);
- EBUG_ON(!s);
-
- out.format = KEY_FORMAT_CURRENT;
-
- return out;
-}
-
-bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
- const struct bkey_format *format)
-{
- EBUG_ON(format->nr_fields != 5);
- EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
- if (!bch_bkey_transform_key(format, out,
- &bch_bkey_format_current, (void *) in))
- return false;
-
- out->format = KEY_FORMAT_LOCAL_BTREE;
-
- bch_bkey_pack_verify(out, in, format);
- return true;
-}
-#endif
-
/**
* bkey_unpack -- unpack the key and the value
*/
@@ -588,12 +551,10 @@ static void __bkey_format_add(struct bkey_format_state *s,
*/
void bch_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
{
- __bkey_format_add(s, BKEY_FIELD_INODE, k->p.inode);
- __bkey_format_add(s, BKEY_FIELD_OFFSET, k->p.offset);
+#define x(id, field) __bkey_format_add(s, id, k->field);
+ bkey_fields()
+#undef x
__bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
- __bkey_format_add(s, BKEY_FIELD_SNAPSHOT, k->p.snapshot);
- __bkey_format_add(s, BKEY_FIELD_SIZE, k->size);
- __bkey_format_add(s, BKEY_FIELD_VERSION, k->version);
}
void bch_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
@@ -636,6 +597,12 @@ struct bkey_format bch_bkey_format_done(struct bkey_format_state *s)
bits += ret.bits_per_field[i];
}
+ /* allow for extent merging: */
+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
+ ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
+ bits += 4;
+ }
+
ret.key_u64s = DIV_ROUND_UP(bits, 64);
/* if we have enough spare bits, round fields up to nearest byte */
@@ -1014,25 +981,13 @@ int bch_compile_bkey_format(const struct bkey_format *format, void *_out)
/* mov [rdi], eax */
I2(0x89, 0x07);
- out = compile_bkey_field(format, out, BKEY_FIELD_INODE,
- offsetof(struct bkey, p.inode), 8,
- &eax_zeroed);
-
- out = compile_bkey_field(format, out, BKEY_FIELD_OFFSET,
- offsetof(struct bkey, p.offset), 8,
- &eax_zeroed);
-
- out = compile_bkey_field(format, out, BKEY_FIELD_SNAPSHOT,
- offsetof(struct bkey, p.snapshot), 4,
- &eax_zeroed);
-
- out = compile_bkey_field(format, out, BKEY_FIELD_SIZE,
- offsetof(struct bkey, size), 4,
- &eax_zeroed);
-
- out = compile_bkey_field(format, out, BKEY_FIELD_VERSION,
- offsetof(struct bkey, version), 4,
+#define x(id, field) \
+ out = compile_bkey_field(format, out, id, \
+ offsetof(struct bkey, field), \
+ sizeof(((struct bkey *) NULL)->field), \
&eax_zeroed);
+ bkey_fields()
+#undef x
/* retq */
I1(0xc3);
@@ -1078,43 +1033,6 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
}
#endif
-/*
- * Would like to use this if we can make __bkey_cmp_bits() fast enough, it'll be
- * a decent reduction in code size
- */
-#if 0
-static int bkey_cmp_verify(const struct bkey *l, const struct bkey *r)
-{
- if (l->p.inode != r->p.inode)
- return l->p.inode < r->p.inode ? -1 : 1;
-
- if (l->p.offset != r->p.offset)
- return l->p.offset < r->p.offset ? -1 : 1;
-
- if (l->p.snapshot != r->p.snapshot)
- return l->p.snapshot < r->p.snapshot ? -1 : 1;
-
- return 0;
-}
-
-int bkey_cmp(const struct bkey *l, const struct bkey *r)
-{
- int ret;
-
- EBUG_ON(bkey_packed(l) || bkey_packed(r));
-
- ret = __bkey_cmp_bits((sizeof(l->inode) +
- sizeof(l->offset) +
- sizeof(l->snapshot)) * BITS_PER_BYTE,
- __high_word(BKEY_U64s, l),
- __high_word(BKEY_U64s, r));
-
- BUG_ON(ret != bkey_cmp_verify(l, r));
-
- return ret;
-}
-#endif
-
__pure
int __bkey_cmp_packed_format_checked(const struct bkey_packed *l,
const struct bkey_packed *r,
@@ -1214,7 +1132,7 @@ void bkey_pack_test(void)
struct bkey_format test_format = {
.key_u64s = 2,
- .nr_fields = 5,
+ .nr_fields = BKEY_NR_FIELDS,
.bits_per_field = {
13,
64,
@@ -1230,21 +1148,9 @@ void bkey_pack_test(void)
u64 a, v = get_inc_field(&in_s, i);
switch (i) {
- case 0:
- a = t.p.inode;
- break;
- case 1:
- a = t.p.offset;
- break;
- case 2:
- a = t.p.snapshot;
- break;
- case 3:
- a = t.size;
- break;
- case 4:
- a = t.version;
- break;
+#define x(id, field) case id: a = t.field; break;
+ bkey_fields()
+#undef x
default:
BUG();
}
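The bkey.c changes replace per-field open coding with a bkey_fields() x-macro, so packing, unpacking, format construction, the compiled unpack code and the self-test all iterate the same field list (which now includes the split version.hi/version.lo). A self-contained toy illustration of the x-macro technique, using a made-up struct rather than the bcache definitions:

#include <stdio.h>

struct point { int x, y, z; };

#define POINT_FIELDS()			\
	X(0, x)				\
	X(1, y)				\
	X(2, z)

static void print_point(const struct point *p)
{
	/* expands to one printf per entry in POINT_FIELDS() */
#define X(id, field) printf("field %d = %d\n", id, p->field);
	POINT_FIELDS()
#undef X
}

int main(void)
{
	struct point p = { 1, 2, 3 };

	print_point(&p);
	return 0;
}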
diff --git a/libbcache/bkey.h b/libbcache/bkey.h
index 3e29cdd..0893134 100644
--- a/libbcache/bkey.h
+++ b/libbcache/bkey.h
@@ -5,6 +5,7 @@
#include <linux/bcache.h>
#include "util.h"
+#include "vstructs.h"
void bch_to_binary(char *, const u64 *, unsigned);
int bch_bkey_to_text(char *, size_t, const struct bkey *);
@@ -28,15 +29,7 @@ struct bkey_s {
};
};
-#define bkey_next(_k) \
-({ \
- BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
- !type_is(_k, struct bkey_i *) && \
- !type_is(_k, struct bkey_packed *)); \
- \
- ((typeof(_k)) __bkey_idx(((struct bkey *) (_k)), \
- ((struct bkey *) (_k))->u64s)); \
-})
+#define bkey_next(_k) vstruct_next(_k)
static inline unsigned bkey_val_u64s(const struct bkey *k)
{
@@ -218,6 +211,22 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r)
void bch_bpos_swab(struct bpos *);
void bch_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
+{
+ if (l.hi != r.hi)
+ return l.hi < r.hi ? -1 : 1;
+ if (l.lo != r.lo)
+ return l.lo < r.lo ? -1 : 1;
+ return 0;
+}
+
+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
+
+static __always_inline int bversion_zero(struct bversion v)
+{
+ return !bversion_cmp(v, ZERO_VERSION);
+}
+
#ifdef CONFIG_BCACHE_DEBUG
/* statement expressions confusing unlikely()? */
#define bkey_packed(_k) \
@@ -555,6 +564,7 @@ static inline void __bch_extent_assert(u8 type, u8 nr)
}
__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch_extent_assert);
+BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION);
BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
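bversion_cmp() orders the new two-word key version lexicographically, hi word first, then lo, and bversion_zero() tests against ZERO_VERSION. A usage fragment with illustrative values:

struct bversion a = { .hi = 0, .lo = 100 };
struct bversion b = { .hi = 1, .lo = 0 };

BUG_ON(bversion_cmp(a, b) >= 0);	/* a < b: the hi word dominates */
BUG_ON(bversion_cmp(a, a) != 0);
BUG_ON(!bversion_zero(ZERO_VERSION));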
diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c
index cd231f5..d3a373c 100644
--- a/libbcache/blockdev.c
+++ b/libbcache/blockdev.c
@@ -2,11 +2,12 @@
#include "bcache.h"
#include "blockdev.h"
#include "btree_iter.h"
+#include "btree_update.h"
#include "checksum.h"
#include "error.h"
#include "inode.h"
#include "request.h"
-#include "super.h"
+#include "super-io.h"
#include "writeback.h"
#include <linux/kthread.h>
@@ -42,15 +43,22 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
down(&dc->sb_write_mutex);
closure_init(cl, parent);
+ sb->csum = csum_vstruct(NULL, BCH_CSUM_CRC64,
+ (struct nonce) { 0 }, sb).lo;
+
bio_reset(bio);
- bio->bi_end_io = write_bdev_super_endio;
- bio->bi_private = dc;
+ bio->bi_bdev = dc->disk_sb.bdev;
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
+ bio->bi_iter.bi_size =
+ roundup(vstruct_bytes(sb),
+ bdev_logical_block_size(dc->disk_sb.bdev));
+ bio->bi_end_io = write_bdev_super_endio;
+ bio->bi_private = dc;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA|REQ_META);
+ bch_bio_map(bio, sb);
closure_get(cl);
- sb->csum = cpu_to_le64(__csum_set(sb, 0, BCH_CSUM_CRC64));
- __write_super(dc->disk.c, (void *) &dc->disk_sb);
-
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}
@@ -263,7 +271,7 @@ static void calc_cached_dev_sectors(struct cache_set *c)
void bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
- char buf[SB_LABEL_SIZE + 1];
+ char buf[BCH_SB_LABEL_SIZE + 1];
char *env[] = {
"DRIVER=bcache",
kasprintf(GFP_KERNEL, "CACHED_UUID=%pU",
@@ -272,8 +280,8 @@ void bch_cached_dev_run(struct cached_dev *dc)
NULL,
};
- memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
- buf[SB_LABEL_SIZE] = '\0';
+ memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
+ buf[BCH_SB_LABEL_SIZE] = '\0';
env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
if (atomic_xchg(&dc->running, 1)) {
@@ -370,8 +378,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
bdevname(dc->disk_sb.bdev, buf);
if (memcmp(&dc->disk_sb.sb->set_uuid,
- &c->disk_sb.set_uuid,
- sizeof(c->disk_sb.set_uuid)))
+ &c->sb.uuid,
+ sizeof(c->sb.uuid)))
return -ENOENT;
if (dc->disk.c) {
@@ -424,7 +432,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
SET_CACHED_DEV(&dc->disk.inode.v, true);
dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid;
memcpy(dc->disk.inode.v.i_label,
- dc->disk_sb.sb->label, SB_LABEL_SIZE);
+ dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
dc->disk.inode.v.i_ctime = rtime;
dc->disk.inode.v.i_mtime = rtime;
@@ -438,14 +446,15 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
pr_info("attached inode %llu", bcache_dev_inum(&dc->disk));
- dc->disk_sb.sb->set_uuid = c->disk_sb.set_uuid;
+ dc->disk_sb.sb->set_uuid = c->sb.uuid;
SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
} else {
dc->disk.inode.v.i_mtime = rtime;
- bch_inode_update(c, &dc->disk.inode.k_i, NULL);
+ bch_btree_update(c, BTREE_ID_INODES,
+ &dc->disk.inode.k_i, NULL);
}
/* Count dirty sectors before attaching */
@@ -479,7 +488,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
pr_info("Caching %s as %s on set %pU",
bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name,
- dc->disk.c->disk_sb.set_uuid.b);
+ dc->disk.c->sb.uuid.b);
return 0;
}
@@ -517,7 +526,7 @@ static void cached_dev_free(struct closure *cl)
mutex_unlock(&bch_register_lock);
- free_super((void *) &dc->disk_sb);
+ bch_free_super((void *) &dc->disk_sb);
kobject_put(&dc->disk.kobj);
}
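bch_write_bdev_super() now builds the bio itself: the superblock is variable length (vstruct_bytes()), so the I/O size is rounded up to the device's logical block size, and the checksum is a csum_vstruct() over the whole structure with a zero nonce, since backing-device superblocks are not encrypted. A small sketch of the size calculation with illustrative numbers:

/* e.g. a 1536-byte superblock on a 4096-byte-logical-block device
 * must be written as one full 4096-byte block: */
size_t sb_bytes   = vstruct_bytes(sb);				/* 1536 */
unsigned blk_size = bdev_logical_block_size(dc->disk_sb.bdev);	/* 4096 */
size_t io_bytes   = roundup(sb_bytes, blk_size);		/* 4096 */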
diff --git a/libbcache/bset.c b/libbcache/bset.c
index 3488095..a88d801 100644
--- a/libbcache/bset.c
+++ b/libbcache/bset.c
@@ -59,7 +59,7 @@ void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
return;
for (_k = i->start, k = bkey_unpack_key(b, _k);
- _k < bset_bkey_last(i);
+ _k < vstruct_last(i);
_k = _n, k = n) {
_n = bkey_next(_k);
@@ -67,7 +67,7 @@ void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
_k->_data - i->_data, i->u64s, buf);
- if (_n == bset_bkey_last(i))
+ if (_n == vstruct_last(i))
continue;
n = bkey_unpack_key(b, _n);
diff --git a/libbcache/bset.h b/libbcache/bset.h
index f03e6b8..70868c5 100644
--- a/libbcache/bset.h
+++ b/libbcache/bset.h
@@ -9,6 +9,7 @@
#include "bkey_methods.h"
#include "btree_types.h"
#include "util.h" /* for time_stats */
+#include "vstructs.h"
/*
* BKEYS:
@@ -302,15 +303,6 @@ static inline void btree_node_set_format(struct btree *b,
bch_bset_set_no_aux_tree(b, b->set);
}
-#define __set_bytes(_i, _u64s) (sizeof(*(_i)) + (_u64s) * sizeof(u64))
-#define set_bytes(_i) __set_bytes(_i, (_i)->u64s)
-
-#define __set_blocks(_i, _u64s, _block_bytes) \
- DIV_ROUND_UP((size_t) __set_bytes((_i), (_u64s)), (_block_bytes))
-
-#define set_blocks(_i, _block_bytes) \
- __set_blocks((_i), (_i)->u64s, (_block_bytes))
-
static inline struct bset *bset_next_set(struct btree *b,
unsigned block_bytes)
{
@@ -318,7 +310,7 @@ static inline struct bset *bset_next_set(struct btree *b,
EBUG_ON(!is_power_of_2(block_bytes));
- return ((void *) i) + round_up(set_bytes(i), block_bytes);
+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
}
void bch_btree_keys_free(struct btree *);
@@ -387,11 +379,6 @@ static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
(cmp == 0 && !strictly_greater && !bkey_deleted(k));
}
-static inline struct bkey_packed *bset_bkey_idx(struct bset *i, unsigned idx)
-{
- return bkey_idx(i, idx);
-}
-
struct bset_tree *bch_bkey_to_bset(struct btree *, struct bkey_packed *);
struct bkey_packed *bkey_prev_all(struct btree *, struct bset_tree *,
struct bkey_packed *);
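The __set_bytes()/set_blocks() macros removed above are superseded by generic helpers from the new vstructs.h, whose diff is not included in this section. Judging only from how they are used here, they compute the byte and block footprint of any structure that ends in a u64s-counted payload. A hedged sketch of what such helpers plausibly look like; the real definitions may differ:

/* hypothetical stand-ins, named example_* to avoid implying these are
 * the actual vstructs.h definitions: */
#define __example_vstruct_bytes(_type, _u64s)				\
	(offsetof(_type, _data) + (_u64s) * sizeof(u64))

#define example_vstruct_bytes(_s)					\
	__example_vstruct_bytes(typeof(*(_s)), le16_to_cpu((_s)->u64s))

#define example_vstruct_blocks(_s, _block_bits)				\
	DIV_ROUND_UP(example_vstruct_bytes(_s), 1U << (_block_bits))

#define example_vstruct_last(_s)					\
	((typeof(&(_s)->start[0])) ((_s)->_data + le16_to_cpu((_s)->u64s)))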
diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c
index ca6064a..4d5efdb 100644
--- a/libbcache/btree_cache.c
+++ b/libbcache/btree_cache.c
@@ -695,7 +695,7 @@ retry:
EBUG_ON(!b->written);
EBUG_ON(b->btree_id != iter->btree_id ||
- BSET_BTREE_LEVEL(&b->data->keys) != level ||
+ BTREE_NODE_LEVEL(b->data) != level ||
bkey_cmp(b->data->max_key, k->k.p));
return b;
diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c
index 8417187..5c77b26 100644
--- a/libbcache/btree_gc.c
+++ b/libbcache/btree_gc.c
@@ -18,6 +18,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "super-io.h"
#include "writeback.h"
#include <linux/slab.h>
@@ -118,8 +119,8 @@ u8 bch_btree_key_recalc_oldest_gen(struct cache_set *c, struct bkey_s_c k)
/*
* For runtime mark and sweep:
*/
-u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
- struct bkey_s_c k)
+static u8 bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
+ struct bkey_s_c k)
{
switch (type) {
case BKEY_TYPE_BTREE:
@@ -133,10 +134,14 @@ u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
}
}
-static u8 btree_mark_key(struct cache_set *c, struct btree *b,
- struct bkey_s_c k)
+u8 bch_btree_mark_key_initial(struct cache_set *c, enum bkey_type type,
+ struct bkey_s_c k)
{
- return __bch_btree_mark_key(c, btree_node_type(b), k);
+ atomic64_set(&c->key_version,
+ max_t(u64, k.k->version.lo,
+ atomic64_read(&c->key_version)));
+
+ return bch_btree_mark_key(c, type, k);
}
static bool btree_gc_mark_node(struct cache_set *c, struct btree *b)
@@ -151,7 +156,8 @@ static bool btree_gc_mark_node(struct cache_set *c, struct btree *b)
btree_node_is_extents(b),
&unpacked) {
bkey_debugcheck(c, b, k);
- stale = max(stale, btree_mark_key(c, b, k));
+ stale = max(stale, bch_btree_mark_key(c,
+ btree_node_type(b), k));
}
if (btree_gc_rewrite_disabled(c))
@@ -218,7 +224,7 @@ static int bch_gc_btree(struct cache_set *c, enum btree_id btree_id)
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
- __bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
+ bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
mutex_unlock(&c->btree_root_lock);
@@ -265,22 +271,21 @@ static void bch_mark_allocator_buckets(struct cache_set *c)
static void bch_mark_metadata(struct cache_set *c)
{
struct cache *ca;
- unsigned i;
+ unsigned i, j;
+ u64 b;
for_each_cache(ca, c, i) {
- unsigned j;
- u64 *i;
-
- for (j = 0; j < bch_nr_journal_buckets(ca->disk_sb.sb); j++)
- bch_mark_metadata_bucket(ca,
- &ca->buckets[journal_bucket(ca->disk_sb.sb, j)],
- true);
+ for (j = 0; j < ca->journal.nr; j++) {
+ b = ca->journal.buckets[j];
+ bch_mark_metadata_bucket(ca, ca->buckets + b, true);
+ }
spin_lock(&ca->prio_buckets_lock);
- for (i = ca->prio_buckets;
- i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
- bch_mark_metadata_bucket(ca, &ca->buckets[*i], true);
+ for (j = 0; j < prio_buckets(ca) * 2; j++) {
+ b = ca->prio_buckets[j];
+ bch_mark_metadata_bucket(ca, ca->buckets + b, true);
+ }
spin_unlock(&ca->prio_buckets_lock);
}
@@ -476,9 +481,8 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
/* Check if all keys in @old_nodes could fit in one fewer node */
if (nr_old_nodes <= 1 ||
- __set_blocks(old_nodes[0]->data,
- DIV_ROUND_UP(u64s, nr_old_nodes - 1),
- block_bytes(c)) > blocks)
+ __vstruct_blocks(struct btree_node, c->block_bits,
+ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
return;
res = bch_btree_reserve_get(c, parent, nr_old_nodes,
@@ -542,9 +546,9 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
u64s = 0;
for (k = s2->start;
- k < bset_bkey_last(s2) &&
- __set_blocks(n1->data, le16_to_cpu(s1->u64s) + u64s + k->u64s,
- block_bytes(c)) <= blocks;
+ k < vstruct_last(s2) &&
+ vstruct_blocks_plus(n1->data, c->block_bits,
+ u64s + k->u64s) <= blocks;
k = bkey_next(k)) {
last = k;
u64s += k->u64s;
@@ -554,7 +558,7 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
/* n2 fits entirely in n1 */
n1->key.k.p = n1->data->max_key = n2->data->max_key;
- memcpy_u64s(bset_bkey_last(s1),
+ memcpy_u64s(vstruct_last(s1),
s2->start,
le16_to_cpu(s2->u64s));
le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
@@ -578,12 +582,12 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
btree_type_successor(iter->btree_id,
n1->data->max_key);
- memcpy_u64s(bset_bkey_last(s1),
+ memcpy_u64s(vstruct_last(s1),
s2->start, u64s);
le16_add_cpu(&s1->u64s, u64s);
memmove(s2->start,
- bset_bkey_idx(s2, u64s),
+ vstruct_idx(s2, u64s),
(le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
@@ -866,7 +870,7 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
for_each_btree_node_key_unpack(b, k, &node_iter,
btree_node_is_extents(b),
&unpacked)
- btree_mark_key(c, b, k);
+ bch_btree_mark_key_initial(c, btree_node_type(b), k);
}
bch_btree_iter_cond_resched(&iter);
@@ -874,8 +878,8 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
bch_btree_iter_unlock(&iter);
- __bch_btree_mark_key(c, BKEY_TYPE_BTREE,
- bkey_i_to_s_c(&c->btree_roots[id].b->key));
+ bch_btree_mark_key(c, BKEY_TYPE_BTREE,
+ bkey_i_to_s_c(&c->btree_roots[id].b->key));
}
int bch_initial_gc(struct cache_set *c, struct list_head *journal)
@@ -889,6 +893,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
bch_journal_mark(c, journal);
}
+ /*
+ * Skip past versions that might have possibly been used (as nonces),
+ * but hadn't had their pointers written:
+ */
+ if (c->sb.encryption_type)
+ atomic64_add(1 << 16, &c->key_version);
+
bch_mark_metadata(c);
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
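Initial GC now tracks the highest key version seen on disk (bch_btree_mark_key_initial() updates c->key_version), and when encryption is enabled it additionally skips ahead by 1 << 16, per the comment above, to step past versions that may already have been consumed as nonces without their keys ever reaching disk. Illustrative arithmetic only:

/* if the largest version found during initial GC was 1000: */
u64 max_seen = 1000;

atomic64_set(&c->key_version, max_seen);
if (c->sb.encryption_type)
	atomic64_add(1 << 16, &c->key_version);
/* new versions are handed out above 1000 + 65536 = 66536 */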
diff --git a/libbcache/btree_gc.h b/libbcache/btree_gc.h
index 91d31c0..0607187 100644
--- a/libbcache/btree_gc.h
+++ b/libbcache/btree_gc.h
@@ -11,7 +11,7 @@ void bch_gc_thread_stop(struct cache_set *);
int bch_gc_thread_start(struct cache_set *);
int bch_initial_gc(struct cache_set *, struct list_head *);
u8 bch_btree_key_recalc_oldest_gen(struct cache_set *, struct bkey_s_c);
-u8 __bch_btree_mark_key(struct cache_set *, enum bkey_type,
+u8 bch_btree_mark_key_initial(struct cache_set *, enum bkey_type,
struct bkey_s_c);
/*
diff --git a/libbcache/btree_io.c b/libbcache/btree_io.c
index 4c295af..e772c6a 100644
--- a/libbcache/btree_io.c
+++ b/libbcache/btree_io.c
@@ -13,6 +13,7 @@
#include "extents.h"
#include "io.h"
#include "journal.h"
+#include "super-io.h"
#include <trace/events/bcache.h>
@@ -39,7 +40,7 @@ static void clear_needs_whiteout(struct bset *i)
{
struct bkey_packed *k;
- for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
k->needs_whiteout = false;
}
@@ -47,7 +48,7 @@ static void set_needs_whiteout(struct bset *i)
{
struct bkey_packed *k;
- for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
k->needs_whiteout = true;
}
@@ -341,7 +342,7 @@ bool __bch_compact_whiteouts(struct cache_set *c, struct btree *b,
compacting = true;
u_start = u_pos;
start = i->start;
- end = bset_bkey_last(i);
+ end = vstruct_last(i);
if (src != dst) {
memmove(dst, src, sizeof(*src));
@@ -574,7 +575,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
order = sorting_entire_node
? btree_page_order(c)
- : get_order(__set_bytes(b->data, u64s));
+ : get_order(__vstruct_bytes(struct btree_node, u64s));
out = btree_bounce_alloc(c, order, &used_mempool);
@@ -589,8 +590,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
out->keys.u64s = cpu_to_le16(u64s);
- BUG_ON((void *) bset_bkey_last(&out->keys) >
- (void *) out + (PAGE_SIZE << order));
+ BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
if (sorting_entire_node)
bch_time_stats_update(&c->btree_sort_time, start_time);
@@ -654,7 +654,7 @@ static struct btree_nr_keys sort_repack(struct bset *dst,
bool filter_whiteouts)
{
struct bkey_format *in_f = &src->format;
- struct bkey_packed *in, *out = bset_bkey_last(dst);
+ struct bkey_packed *in, *out = vstruct_last(dst);
struct btree_nr_keys nr;
memset(&nr, 0, sizeof(nr));
@@ -723,7 +723,7 @@ static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
btree_keys_account_key_add(&nr, 0, prev);
prev = bkey_next(prev);
} else {
- prev = bset_bkey_last(dst);
+ prev = vstruct_last(dst);
}
bkey_copy(prev, &tmp.k);
@@ -734,7 +734,7 @@ static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
btree_keys_account_key_add(&nr, 0, prev);
out = bkey_next(prev);
} else {
- out = bset_bkey_last(dst);
+ out = vstruct_last(dst);
}
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
@@ -854,22 +854,23 @@ void bch_btree_init_next(struct cache_set *c, struct btree *b,
bch_btree_iter_reinit_node(iter, b);
}
-/*
- * We seed the checksum with the entire first pointer (dev, gen and offset),
- * since for btree nodes we have to store the checksum with the data instead of
- * the pointer - this helps guard against reading a valid btree node that is not
- * the node we actually wanted:
- */
-#define btree_csum_set(_b, _i) \
-({ \
- void *_data = (void *) (_i) + 8; \
- void *_end = bset_bkey_last(&(_i)->keys); \
- \
- bch_checksum_update(BSET_CSUM_TYPE(&(_i)->keys), \
- bkey_i_to_extent_c(&(_b)->key)->v._data[0], \
- _data, \
- _end - _data) ^ 0xffffffffffffffffULL; \
-})
+static struct nonce btree_nonce(struct btree *b,
+ struct bset *i,
+ unsigned offset)
+{
+ return (struct nonce) {{
+ [0] = cpu_to_le32(offset),
+ [1] = ((__le32 *) &i->seq)[0],
+ [2] = ((__le32 *) &i->seq)[1],
+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+ }};
+}
+
+static void bset_encrypt(struct cache_set *c, struct bset *i, struct nonce nonce)
+{
+ bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
+}
#define btree_node_error(b, c, ptr, fmt, ...) \
cache_set_inconsistent(c, \
@@ -877,7 +878,7 @@ void bch_btree_init_next(struct cache_set *c, struct btree *b,
(b)->btree_id, (b)->level, btree_node_root(c, b) \
? btree_node_root(c, b)->level : -1, \
PTR_BUCKET_NR(ca, ptr), (b)->written, \
- (i)->u64s, ##__VA_ARGS__)
+ le16_to_cpu((i)->u64s), ##__VA_ARGS__)
static const char *validate_bset(struct cache_set *c, struct btree *b,
struct cache *ca,
@@ -886,6 +887,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
unsigned *whiteout_u64s)
{
struct bkey_packed *k, *prev = NULL;
+ struct bpos prev_pos = POS_MIN;
bool seen_non_whiteout = false;
if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
@@ -903,7 +905,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
}
for (k = i->start;
- k != bset_bkey_last(i);) {
+ k != vstruct_last(i);) {
struct bkey_s_c u;
struct bkey tmp;
const char *invalid;
@@ -911,13 +913,13 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
if (!k->u64s) {
btree_node_error(b, c, ptr,
"KEY_U64s 0: %zu bytes of metadata lost",
- (void *) bset_bkey_last(i) - (void *) k);
+ vstruct_end(i) - (void *) k);
i->u64s = cpu_to_le16((u64 *) k - i->_data);
break;
}
- if (bkey_next(k) > bset_bkey_last(i)) {
+ if (bkey_next(k) > vstruct_last(i)) {
btree_node_error(b, c, ptr,
"key extends past end of bset");
@@ -931,7 +933,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
- (u64 *) bset_bkey_last(i) - (u64 *) k);
+ (u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
@@ -951,7 +953,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
- (u64 *) bset_bkey_last(i) - (u64 *) k);
+ (u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
@@ -963,22 +965,40 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
if (!seen_non_whiteout &&
(!bkey_whiteout(k) ||
- (prev && bkey_cmp_left_packed_byval(b, prev,
- bkey_start_pos(u.k)) > 0))) {
+ (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
+ } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
+ btree_node_error(b, c, ptr,
+ "keys out of order: %llu:%llu > %llu:%llu",
+ prev_pos.inode,
+ prev_pos.offset,
+ u.k->p.inode,
+ bkey_start_offset(u.k));
+ /* XXX: repair this */
}
+ prev_pos = u.k->p;
prev = k;
k = bkey_next(k);
}
SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
- b->written += sectors;
return NULL;
}
+static bool extent_contains_ptr(struct bkey_s_c_extent e,
+ struct bch_extent_ptr match)
+{
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr)
+ if (!memcmp(ptr, &match, sizeof(*ptr)))
+ return true;
+
+ return false;
+}
+
void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
struct cache *ca,
const struct bch_extent_ptr *ptr)
@@ -990,6 +1010,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
bool used_mempool;
unsigned u64s;
const char *err;
+ struct bch_csum csum;
+ struct nonce nonce;
int ret;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
@@ -1005,40 +1027,62 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
if (!b->written) {
i = &b->data->keys;
+ err = "bad magic";
+ if (le64_to_cpu(b->data->magic) != bset_magic(c))
+ goto err;
+
+ err = "bad btree header";
+ if (!b->data->keys.seq)
+ goto err;
+
err = "unknown checksum type";
- if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
goto err;
/* XXX: retry checksum errors */
+ nonce = btree_nonce(b, i, b->written << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+
err = "bad checksum";
- if (le64_to_cpu(b->data->csum) !=
- btree_csum_set(b, b->data))
+ if (bch_crc_cmp(csum, b->data->csum))
goto err;
- sectors = __set_blocks(b->data,
- le16_to_cpu(b->data->keys.u64s),
- block_bytes(c)) << c->block_bits;
+ bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &b->data->flags,
+ (void *) &b->data->keys -
+ (void *) &b->data->flags);
+ nonce = nonce_add(nonce,
+ round_up((void *) &b->data->keys -
+ (void *) &b->data->flags,
+ CHACHA20_BLOCK_SIZE));
+ bset_encrypt(c, i, nonce);
- err = "bad magic";
- if (le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb))
- goto err;
-
- err = "bad btree header";
- if (!b->data->keys.seq)
- goto err;
+ sectors = vstruct_sectors(b->data, c->block_bits);
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
+ u64 *p = (u64 *) &b->data->ptr;
+
+ *p = swab64(*p);
bch_bpos_swab(&b->data->min_key);
bch_bpos_swab(&b->data->max_key);
}
+ err = "incorrect btree id";
+ if (BTREE_NODE_ID(b->data) != b->btree_id)
+ goto err;
+
+ err = "incorrect level";
+ if (BTREE_NODE_LEVEL(b->data) != b->level)
+ goto err;
+
err = "incorrect max key";
if (bkey_cmp(b->data->max_key, b->key.k.p))
goto err;
- err = "incorrect level";
- if (BSET_BTREE_LEVEL(i) != b->level)
+ err = "incorrect backpointer";
+ if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
+ b->data->ptr))
goto err;
err = bch_bkey_format_validate(&b->data->format);
@@ -1056,23 +1100,27 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
break;
err = "unknown checksum type";
- if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+ if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
goto err;
+ nonce = btree_nonce(b, i, b->written << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
err = "bad checksum";
- if (le64_to_cpu(bne->csum) !=
- btree_csum_set(b, bne))
+ if (memcmp(&csum, &bne->csum, sizeof(csum)))
goto err;
- sectors = __set_blocks(bne,
- le16_to_cpu(bne->keys.u64s),
- block_bytes(c)) << c->block_bits;
+ bset_encrypt(c, i, nonce);
+
+ sectors = vstruct_sectors(bne, c->block_bits);
}
err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
if (err)
goto err;
+ b->written += sectors;
+
err = "insufficient memory";
ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
if (ret < 0)
@@ -1083,11 +1131,11 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
__bch_btree_node_iter_push(iter, b,
i->start,
- bkey_idx(i, whiteout_u64s));
+ vstruct_idx(i, whiteout_u64s));
__bch_btree_node_iter_push(iter, b,
- bkey_idx(i, whiteout_u64s),
- bset_bkey_last(i));
+ vstruct_idx(i, whiteout_u64s),
+ vstruct_last(i));
}
err = "corrupted btree";
@@ -1290,6 +1338,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
struct bch_extent_ptr *ptr;
struct cache *ca;
struct sort_iter sort_iter;
+ struct nonce nonce;
unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
u64 seq = 0;
bool used_mempool;
@@ -1330,7 +1379,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
BUG_ON(b->written >= c->sb.btree_node_size);
BUG_ON(bset_written(b, btree_bset_last(b)));
- BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb));
+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
if (lock_type_held == SIX_LOCK_intent) {
@@ -1396,7 +1445,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
b->whiteout_u64s = 0;
u64s = btree_node_is_extents(b)
- ? sort_extents(bset_bkey_last(i), &sort_iter, false)
+ ? sort_extents(vstruct_last(i), &sort_iter, false)
: sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
@@ -1413,14 +1462,30 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
BUG_ON(i->seq != b->data->keys.seq);
i->version = cpu_to_le16(BCACHE_BSET_VERSION);
- SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+ SET_BSET_CSUM_TYPE(i, bch_meta_checksum_type(c));
+
+ nonce = btree_nonce(b, i, b->written << 9);
+
+ if (bn) {
+ bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &bn->flags,
+ (void *) &b->data->keys -
+ (void *) &b->data->flags);
+ nonce = nonce_add(nonce,
+ round_up((void *) &b->data->keys -
+ (void *) &b->data->flags,
+ CHACHA20_BLOCK_SIZE));
+ bset_encrypt(c, i, nonce);
+
+ nonce = btree_nonce(b, i, b->written << 9);
+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+ } else {
+ bset_encrypt(c, i, nonce);
- if (bn)
- bn->csum = cpu_to_le64(btree_csum_set(b, bn));
- else
- bne->csum = cpu_to_le64(btree_csum_set(b, bne));
+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ }
- bytes_to_write = (void *) bset_bkey_last(i) - data;
+ bytes_to_write = vstruct_end(i) - data;
sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
memset(data + bytes_to_write, 0,
@@ -1548,7 +1613,7 @@ bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b)
* If later we don't unconditionally sort down to a single bset, we have
* to ensure this is still true:
*/
- BUG_ON((void *) bset_bkey_last(btree_bset_last(b)) > write_block(b));
+ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
bne = want_new_bset(c, b);
if (bne)
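The btree node read/write paths above adopt the same encrypt-then-checksum layout as the prio buckets: btree_nonce() mixes the bset's sector offset within the node, its seq and journal_seq with BCH_NONCE_BTREE; the header from flags up to keys is encrypted with the base nonce, the nonce is then advanced past that ChaCha20-block-rounded region before encrypting the bset, and csum_vstruct() is taken over the encrypted bytes so reads verify before decrypting. A condensed restatement of the write-side ordering for the first bset in a node; the helper name is hypothetical:

static void example_seal_first_bset(struct cache_set *c, struct btree *b,
				    struct btree_node *bn, struct bset *i)
{
	struct nonce nonce = btree_nonce(b, i, b->written << 9);
	size_t hdr_len = (void *) &bn->keys - (void *) &bn->flags;

	/* encrypt flags..keys with the base nonce */
	bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, hdr_len);

	/* advance past the block-rounded header, then encrypt the keys */
	bset_encrypt(c, i, nonce_add(nonce,
			round_up(hdr_len, CHACHA20_BLOCK_SIZE)));

	/* checksum covers the now-encrypted node */
	nonce = btree_nonce(b, i, b->written << 9);
	bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
}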
diff --git a/libbcache/btree_types.h b/libbcache/btree_types.h
index 176d42a..4cbec7f 100644
--- a/libbcache/btree_types.h
+++ b/libbcache/btree_types.h
@@ -202,24 +202,12 @@ __btree_node_offset_to_key(const struct btree *b, u16 k)
return (void *) ((u64 *) b->data + k + 1);
}
-#define __bkey_idx(_set, _offset) \
- ((_set)->_data + (_offset))
-
-#define bkey_idx(_set, _offset) \
- ((typeof(&(_set)->start[0])) __bkey_idx((_set), (_offset)))
-
-#define __bset_bkey_last(_set) \
- __bkey_idx((_set), (_set)->u64s)
-
-#define bset_bkey_last(_set) \
- bkey_idx((_set), le16_to_cpu((_set)->u64s))
-
#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
#define btree_bkey_last(_b, _t) \
({ \
EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
- bset_bkey_last(bset(_b, _t))); \
+ vstruct_last(bset(_b, _t))); \
\
__btree_node_offset_to_key(_b, (_t)->end_offset); \
})
@@ -227,7 +215,7 @@ __btree_node_offset_to_key(const struct btree *b, u16 k)
static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
{
t->end_offset =
- __btree_node_key_to_offset(b, bset_bkey_last(bset(b, t)));
+ __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
btree_bkey_last(b, t);
}
diff --git a/libbcache/btree_update.c b/libbcache/btree_update.c
index 95406a4..c3bb209 100644
--- a/libbcache/btree_update.c
+++ b/libbcache/btree_update.c
@@ -12,7 +12,7 @@
#include "extents.h"
#include "journal.h"
#include "keylist.h"
-#include "super.h"
+#include "super-io.h"
#include <linux/random.h>
#include <linux/sort.h>
@@ -80,7 +80,7 @@ bool bch_btree_node_format_fits(struct cache_set *c, struct btree *b,
{
size_t u64s = btree_node_u64s_with_format(b, new_f);
- return __set_bytes(b->data, u64s) < btree_bytes(c);
+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
}
/* Btree node freeing/allocation: */
@@ -298,8 +298,11 @@ static struct btree *bch_btree_node_alloc(struct cache_set *c,
bch_bset_init_first(b, &b->data->keys);
memset(&b->nr, 0, sizeof(b->nr));
- b->data->magic = cpu_to_le64(bset_magic(&c->disk_sb));
- SET_BSET_BTREE_LEVEL(&b->data->keys, level);
+ b->data->magic = cpu_to_le64(bset_magic(c));
+ b->data->flags = 0;
+ SET_BTREE_NODE_ID(b->data, id);
+ SET_BTREE_NODE_LEVEL(b->data, level);
+ b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
bch_btree_build_aux_trees(b);
@@ -1292,7 +1295,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
*/
k = set1->start;
while (1) {
- if (bkey_next(k) == bset_bkey_last(set1))
+ if (bkey_next(k) == vstruct_last(set1))
break;
if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
break;
@@ -1313,7 +1316,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
n2->data->min_key =
btree_type_successor(n1->btree_id, n1->key.k.p);
- set2->u64s = cpu_to_le16((u64 *) bset_bkey_last(set1) - (u64 *) k);
+ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
set_btree_bset_end(n1, n1->set);
@@ -1333,7 +1336,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
BUG_ON(!set2->u64s);
memcpy_u64s(set2->start,
- bset_bkey_last(set1),
+ vstruct_end(set1),
le16_to_cpu(set2->u64s));
btree_node_reset_sib_u64s(n1);
@@ -1393,12 +1396,12 @@ static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
*/
i = btree_bset_first(b);
p = i->start;
- while (p != bset_bkey_last(i))
+ while (p != vstruct_last(i))
if (bkey_deleted(p)) {
le16_add_cpu(&i->u64s, -p->u64s);
set_btree_bset_end(b, b->set);
memmove_u64s_down(p, bkey_next(p),
- (u64 *) bset_bkey_last(i) -
+ (u64 *) vstruct_last(i) -
(u64 *) p);
} else
p = bkey_next(p);
@@ -1428,9 +1431,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
if (b->level)
btree_split_insert_keys(iter, n1, insert_keys, reserve);
- if (__set_blocks(n1->data,
- le16_to_cpu(n1->data->keys.u64s),
- block_bytes(c)) > BTREE_SPLIT_THRESHOLD(c)) {
+ if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
trace_bcache_btree_node_split(c, b, b->nr.live_u64s);
n2 = __btree_split_node(iter, n1, reserve);
@@ -1939,7 +1940,7 @@ retry:
u64s = 0;
trans_for_each_entry(trans, i)
if (!i->done)
- u64s += jset_u64s(i->k->k.u64s);
+ u64s += jset_u64s(i->k->k.u64s + i->extra_res);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
@@ -1966,7 +1967,7 @@ retry:
* written one
*/
if (!i->done) {
- u64s += i->k->k.u64s;
+ u64s += i->k->k.u64s + i->extra_res;
if (!bch_btree_node_insert_fits(c,
i->iter->nodes[0], u64s)) {
split = i->iter;
@@ -2217,7 +2218,7 @@ int bch_btree_update(struct cache_set *c, enum btree_id id,
int bch_btree_delete_range(struct cache_set *c, enum btree_id id,
struct bpos start,
struct bpos end,
- u64 version,
+ struct bversion version,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq)
diff --git a/libbcache/btree_update.h b/libbcache/btree_update.h
index 5fc1b1a..8ff089d 100644
--- a/libbcache/btree_update.h
+++ b/libbcache/btree_update.h
@@ -5,6 +5,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "journal.h"
+#include "vstructs.h"
struct cache_set;
struct bkey_format_state;
@@ -200,7 +201,7 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
struct bset *i)
{
- return round_up(bset_byte_offset(b, bset_bkey_last(i)),
+ return round_up(bset_byte_offset(b, vstruct_end(i)),
block_bytes(c)) >> 9;
}
@@ -208,7 +209,7 @@ static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
- unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) +
+ unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
unsigned total = c->sb.btree_node_size << 6;
@@ -235,7 +236,7 @@ static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
{
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
- bset_byte_offset(b, bset_bkey_last(i)));
+ bset_byte_offset(b, vstruct_end(i)));
ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
@@ -244,8 +245,8 @@ static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
EBUG_ON(offset > btree_bytes(c));
if ((unlikely(bset_written(b, i)) && n > 0) ||
- (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) >
- btree_write_set_buffer(b)) && n > btree_write_set_buffer(b)))
+ (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+ n > btree_write_set_buffer(b)))
return (void *) b->data + offset;
return NULL;
@@ -308,6 +309,7 @@ struct btree_insert {
struct btree_insert_entry {
struct btree_iter *iter;
struct bkey_i *k;
+ unsigned extra_res;
/*
* true if entire key was inserted - can only be false for
* extents
@@ -329,6 +331,14 @@ int __bch_btree_insert_at(struct btree_insert *);
.done = false, \
})
+#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \
+ ((struct btree_insert_entry) { \
+ .iter = (_iter), \
+ .k = (_k), \
+ .extra_res = (_extra), \
+ .done = false, \
+ })
+
/**
* bch_btree_insert_at - insert one or more keys at iterator positions
* @iter: btree iterator
@@ -391,7 +401,7 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
return true;
for (i = insert; i < trans->entries + trans->nr; i++)
- u64s += jset_u64s(i->k->k.u64s);
+ u64s += jset_u64s(i->k->k.u64s + i->extra_res);
return u64s <= trans->journal_res.u64s;
}
@@ -404,7 +414,7 @@ int bch_btree_update(struct cache_set *, enum btree_id,
struct bkey_i *, u64 *);
int bch_btree_delete_range(struct cache_set *, enum btree_id,
- struct bpos, struct bpos, u64,
+ struct bpos, struct bpos, struct bversion,
struct disk_reservation *,
struct extent_insert_hook *, u64 *);
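Btree insert entries gain an extra_res field: the extra u64s are added both to the journal reservation (jset_u64s(i->k->k.u64s + i->extra_res) above) and to the leaf-node space check in btree_update.c. A hedged usage fragment; the caller's variables and the value 8 are illustrative:

/* reserve room for 8 u64s beyond the key itself */
struct btree_insert_entry entry =
	BTREE_INSERT_ENTRY_EXTRA_RES(&iter, &new_key->k_i, 8);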
diff --git a/libbcache/buckets.c b/libbcache/buckets.c
index 3398b25..757bc03 100644
--- a/libbcache/buckets.c
+++ b/libbcache/buckets.c
@@ -534,12 +534,10 @@ static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e,
rcu_read_lock();
extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
- bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
-
- trace_bcache_mark_bucket(ca, e.k, ptr, sectors, dirty);
+ trace_bcache_mark_bucket(ca, e.k, ptr, sectors, !ptr->cached);
bch_mark_pointer(c, e, ca, crc, ptr, sectors,
- dirty ? type : S_CACHED,
+ ptr->cached ? S_CACHED : type,
may_make_unavailable,
stats, gc_will_visit, journal_seq);
}
@@ -559,10 +557,13 @@ static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k,
may_make_unavailable, stats,
gc_will_visit, journal_seq);
break;
- case BCH_RESERVATION:
- stats->persistent_reserved += sectors;
+ case BCH_RESERVATION: {
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+ stats->persistent_reserved += r.v->nr_replicas * sectors;
break;
}
+ }
}
void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
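BCH_RESERVATION keys now carry an explicit replica count, and the accounting above multiplies it in: persistent_reserved grows by nr_replicas * sectors rather than raw sectors. A worked example with illustrative numbers:

/* a reservation spanning 128 sectors with nr_replicas == 2: */
stats->persistent_reserved += r.v->nr_replicas * sectors;	/* += 2 * 128 = 256 */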
diff --git a/libbcache/buckets.h b/libbcache/buckets.h
index 35100eb..8194dd9 100644
--- a/libbcache/buckets.h
+++ b/libbcache/buckets.h
@@ -42,7 +42,7 @@ static inline u8 bucket_gc_gen(struct cache *ca, struct bucket *g)
static inline struct cache *PTR_CACHE(const struct cache_set *c,
const struct bch_extent_ptr *ptr)
{
- EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_in_set);
+ EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_devices);
return rcu_dereference(c->cache[ptr->dev]);
}
diff --git a/libbcache/chardev.c b/libbcache/chardev.c
index 0b020c8..b361b09 100644
--- a/libbcache/chardev.c
+++ b/libbcache/chardev.c
@@ -9,6 +9,7 @@
#include "bcache.h"
#include "super.h"
+#include "super-io.h"
#include <linux/module.h>
#include <linux/fs.h>
@@ -202,16 +203,16 @@ static long bch_ioctl_disk_fail(struct cache_set *c,
return ret;
}
-static struct cache_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid)
+static struct bch_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid)
{
- struct cache_member *mi = c->disk_mi;
+ struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
unsigned i;
- lockdep_assert_held(&bch_register_lock);
+ lockdep_assert_held(&c->sb_lock);
- for (i = 0; i < c->disk_sb.nr_in_set; i++)
- if (!memcmp(&mi[i].uuid, &uuid, sizeof(uuid)))
- return &mi[i];
+ for (i = 0; i < c->disk_sb->nr_devices; i++)
+ if (!memcmp(&mi->members[i].uuid, &uuid, sizeof(uuid)))
+ return &mi->members[i];
return NULL;
}
@@ -220,20 +221,20 @@ static long bch_ioctl_disk_remove_by_uuid(struct cache_set *c,
struct bch_ioctl_disk_remove_by_uuid __user *user_arg)
{
struct bch_ioctl_disk_fail_by_uuid arg;
- struct cache_member *m;
+ struct bch_member *m;
int ret = -ENOENT;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->sb_lock);
if ((m = bch_uuid_lookup(c, arg.dev))) {
/* XXX: */
- SET_CACHE_STATE(m, CACHE_FAILED);
- bcache_write_super(c);
+ SET_BCH_MEMBER_STATE(m, BCH_MEMBER_STATE_FAILED);
+ bch_write_super(c);
ret = 0;
}
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->sb_lock);
return ret;
}
@@ -242,19 +243,19 @@ static long bch_ioctl_disk_fail_by_uuid(struct cache_set *c,
struct bch_ioctl_disk_fail_by_uuid __user *user_arg)
{
struct bch_ioctl_disk_fail_by_uuid arg;
- struct cache_member *m;
+ struct bch_member *m;
int ret = -ENOENT;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
- mutex_lock(&bch_register_lock);
+ mutex_lock(&c->sb_lock);
if ((m = bch_uuid_lookup(c, arg.dev))) {
- SET_CACHE_STATE(m, CACHE_FAILED);
- bcache_write_super(c);
+ SET_BCH_MEMBER_STATE(m, BCH_MEMBER_STATE_FAILED);
+ bch_write_super(c);
ret = 0;
}
- mutex_unlock(&bch_register_lock);
+ mutex_unlock(&c->sb_lock);
return ret;
}
@@ -263,8 +264,8 @@ static long bch_ioctl_query_uuid(struct cache_set *c,
struct bch_ioctl_query_uuid __user *user_arg)
{
return copy_to_user(&user_arg->uuid,
- &c->disk_sb.user_uuid,
- sizeof(c->disk_sb.user_uuid));
+ &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid));
}
long bch_cache_set_ioctl(struct cache_set *c, unsigned cmd, void __user *arg)
diff --git a/libbcache/checksum.c b/libbcache/checksum.c
index beae0b2..eb41f2e 100644
--- a/libbcache/checksum.c
+++ b/libbcache/checksum.c
@@ -1,11 +1,19 @@
#include "bcache.h"
#include "checksum.h"
+#include "super.h"
+#include "super-io.h"
#include <linux/crc32c.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <crypto/algapi.h>
#include <crypto/chacha20.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
+#include <keys/user-type.h>
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
@@ -129,7 +137,35 @@ u64 bch_crc64_update(u64 crc, const void *_data, size_t len)
return crc;
}
-u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+static u64 bch_checksum_init(unsigned type)
+{
+ switch (type) {
+ case BCH_CSUM_NONE:
+ return 0;
+ case BCH_CSUM_CRC32C:
+ return U32_MAX;
+ case BCH_CSUM_CRC64:
+ return U64_MAX;
+ default:
+ BUG();
+ }
+}
+
+static u64 bch_checksum_final(unsigned type, u64 crc)
+{
+ switch (type) {
+ case BCH_CSUM_NONE:
+ return 0;
+ case BCH_CSUM_CRC32C:
+ return crc ^ U32_MAX;
+ case BCH_CSUM_CRC64:
+ return crc ^ U64_MAX;
+ default:
+ BUG();
+ }
+}
+
+static u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
{
switch (type) {
case BCH_CSUM_NONE:
@@ -143,32 +179,416 @@ u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
}
}
-u64 bch_checksum(unsigned type, const void *data, size_t len)
+static inline void do_encrypt_sg(struct crypto_blkcipher *tfm,
+ struct nonce nonce,
+ struct scatterlist *sg, size_t len)
+{
+ struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d };
+ int ret;
+
+ ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+ BUG_ON(ret);
+}
+
+static inline void do_encrypt(struct crypto_blkcipher *tfm,
+ struct nonce nonce,
+ void *buf, size_t len)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, buf, len);
+ do_encrypt_sg(tfm, nonce, &sg, len);
+}
+
+int bch_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+ void *buf, size_t len)
+{
+ struct crypto_blkcipher *chacha20 =
+ crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
+ int ret;
+
+	if (IS_ERR(chacha20))
+		return PTR_ERR(chacha20);
+
+ ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ if (ret)
+ goto err;
+
+ do_encrypt(chacha20, nonce, buf, len);
+err:
+ crypto_free_blkcipher(chacha20);
+ return ret;
+}
+
+static void gen_poly_key(struct cache_set *c, struct shash_desc *desc,
+ struct nonce nonce)
+{
+ u8 key[POLY1305_KEY_SIZE];
+
+ nonce.d[3] ^= BCH_NONCE_POLY;
+
+ memset(key, 0, sizeof(key));
+ do_encrypt(c->chacha20, nonce, key, sizeof(key));
+
+ desc->tfm = c->poly1305;
+ desc->flags = 0;
+ crypto_shash_init(desc);
+ crypto_shash_update(desc, key, sizeof(key));
+}
+
+struct bch_csum bch_checksum(struct cache_set *c, unsigned type,
+ struct nonce nonce, const void *data, size_t len)
{
- u64 crc = 0xffffffffffffffffULL;
+ switch (type) {
+ case BCH_CSUM_NONE:
+ case BCH_CSUM_CRC32C:
+ case BCH_CSUM_CRC64: {
+ u64 crc = bch_checksum_init(type);
+
+ crc = bch_checksum_update(type, crc, data, len);
+ crc = bch_checksum_final(type, crc);
+
+ return (struct bch_csum) { .lo = crc };
+ }
+
+ case BCH_CSUM_CHACHA20_POLY1305_80:
+ case BCH_CSUM_CHACHA20_POLY1305_128: {
+ SHASH_DESC_ON_STACK(desc, c->poly1305);
+ u8 digest[POLY1305_DIGEST_SIZE];
+ struct bch_csum ret = { 0 };
+
+ gen_poly_key(c, desc, nonce);
+
+ crypto_shash_update(desc, data, len);
+ crypto_shash_final(desc, digest);
+
+ memcpy(&ret, digest, bch_crc_bytes[type]);
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
- crc = bch_checksum_update(type, crc, data, len);
+void bch_encrypt(struct cache_set *c, unsigned type,
+ struct nonce nonce, void *data, size_t len)
+{
+ if (!bch_csum_type_is_encryption(type))
+ return;
- return crc ^ 0xffffffffffffffffULL;
+ do_encrypt(c->chacha20, nonce, data, len);
}
-u32 bch_checksum_bio(struct bio *bio, unsigned type)
+struct bch_csum bch_checksum_bio(struct cache_set *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
- u32 csum = U32_MAX;
- if (type == BCH_CSUM_NONE)
- return 0;
+ switch (type) {
+ case BCH_CSUM_NONE:
+ return (struct bch_csum) { 0 };
+ case BCH_CSUM_CRC32C:
+ case BCH_CSUM_CRC64: {
+ u64 crc = bch_checksum_init(type);
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+ crc = bch_checksum_update(type,
+ crc, p, bv.bv_len);
+ kunmap_atomic(p);
+ }
+
+ crc = bch_checksum_final(type, crc);
+ return (struct bch_csum) { .lo = crc };
+ }
+
+ case BCH_CSUM_CHACHA20_POLY1305_80:
+ case BCH_CSUM_CHACHA20_POLY1305_128: {
+ SHASH_DESC_ON_STACK(desc, c->poly1305);
+ u8 digest[POLY1305_DIGEST_SIZE];
+ struct bch_csum ret = { 0 };
+
+ gen_poly_key(c, desc, nonce);
+
+ bio_for_each_segment(bv, bio, iter) {
+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+ crypto_shash_update(desc, p, bv.bv_len);
+ kunmap_atomic(p);
+ }
+
+ crypto_shash_final(desc, digest);
+
+ memcpy(&ret, digest, bch_crc_bytes[type]);
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
+void bch_encrypt_bio(struct cache_set *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ struct scatterlist sgl[16], *sg = sgl;
+ size_t bytes = 0;
+
+ if (!bch_csum_type_is_encryption(type))
+ return;
+
+ sg_init_table(sgl, ARRAY_SIZE(sgl));
bio_for_each_segment(bv, bio, iter) {
- void *p = kmap_atomic(bv.bv_page);
+ if (sg == sgl + ARRAY_SIZE(sgl)) {
+ sg_mark_end(sg - 1);
+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+ le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE);
+ bytes = 0;
+
+ sg_init_table(sgl, ARRAY_SIZE(sgl));
+ sg = sgl;
+ }
+
+ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
+ bytes += bv.bv_len;
+
+ }
+
+ sg_mark_end(sg - 1);
+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+}
+
+#ifdef __KERNEL__
+int bch_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+ char key_description[60];
+ struct key *keyring_key;
+ const struct user_key_payload *ukp;
+ int ret;
+
+ snprintf(key_description, sizeof(key_description),
+ "bcache:%pUb", &sb->user_uuid);
+
+ keyring_key = request_key(&key_type_logon, key_description, NULL);
+ if (IS_ERR(keyring_key))
+ return PTR_ERR(keyring_key);
+
+ down_read(&keyring_key->sem);
+ ukp = user_key_payload(keyring_key);
+ if (ukp->datalen == sizeof(*key)) {
+ memcpy(key, ukp->data, ukp->datalen);
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ up_read(&keyring_key->sem);
+ key_put(keyring_key);
+
+ return ret;
+}
+#else
+#include <keyutils.h>
+#include <uuid/uuid.h>
+
+int bch_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+ key_serial_t key_id;
+ char key_description[60];
+ char uuid[40];
+
+ uuid_unparse_lower(sb->user_uuid.b, uuid);
+ sprintf(key_description, "bcache:%s", uuid);
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_USER_KEYRING);
+ if (key_id < 0)
+ return -errno;
+
+ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
+ return -1;
+
+ return 0;
+}
+#endif
- csum = bch_checksum_update(type, csum,
- p + bv.bv_offset,
- bv.bv_len);
- kunmap_atomic(p);
+static int bch_decrypt_sb_key(struct cache_set *c,
+ struct bch_sb_field_crypt *crypt,
+ struct bch_key *key)
+{
+ struct bch_encrypted_key sb_key = crypt->key;
+ struct bch_key user_key;
+ int ret = 0;
+
+ /* is key encrypted? */
+ if (!bch_key_is_encrypted(&sb_key))
+ goto out;
+
+ ret = bch_request_key(c->disk_sb, &user_key);
+ if (ret) {
+ bch_err(c, "error requesting encryption key");
+ goto err;
}
- return csum ^= U32_MAX;
+ /* decrypt real key: */
+ ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c),
+ &sb_key, sizeof(sb_key));
+ if (ret)
+ goto err;
+
+ if (bch_key_is_encrypted(&sb_key)) {
+ bch_err(c, "incorrect encryption key");
+ ret = -EINVAL;
+ goto err;
+ }
+out:
+ *key = sb_key.key;
+err:
+ memzero_explicit(&sb_key, sizeof(sb_key));
+ memzero_explicit(&user_key, sizeof(user_key));
+ return ret;
+}
+
+static int bch_alloc_ciphers(struct cache_set *c)
+{
+ if (!c->chacha20)
+ c->chacha20 = crypto_alloc_blkcipher("chacha20", 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(c->chacha20))
+ return PTR_ERR(c->chacha20);
+
+ if (!c->poly1305)
+ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+ if (IS_ERR(c->poly1305))
+ return PTR_ERR(c->poly1305);
+
+ return 0;
+}
+
+int bch_disable_encryption(struct cache_set *c)
+{
+ struct bch_sb_field_crypt *crypt;
+ struct bch_key key;
+ int ret = -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ crypt = bch_sb_get_crypt(c->disk_sb);
+ if (!crypt)
+ goto out;
+
+ /* is key encrypted? */
+ ret = 0;
+ if (bch_key_is_encrypted(&crypt->key))
+ goto out;
+
+ ret = bch_decrypt_sb_key(c, crypt, &key);
+ if (ret)
+ goto out;
+
+ crypt->key.magic = BCH_KEY_MAGIC;
+ crypt->key.key = key;
+
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0);
+ bch_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch_enable_encryption(struct cache_set *c, bool keyed)
+{
+ struct bch_encrypted_key key;
+ struct bch_key user_key;
+ struct bch_sb_field_crypt *crypt;
+ int ret = -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ /* Do we already have an encryption key? */
+ if (bch_sb_get_crypt(c->disk_sb))
+ goto err;
+
+ ret = bch_alloc_ciphers(c);
+ if (ret)
+ goto err;
+
+ key.magic = BCH_KEY_MAGIC;
+ get_random_bytes(&key.key, sizeof(key.key));
+
+ if (keyed) {
+ ret = bch_request_key(c->disk_sb, &user_key);
+ if (ret) {
+ bch_err(c, "error requesting encryption key");
+ goto err;
+ }
+
+ ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c),
+ &key, sizeof(key));
+ if (ret)
+ goto err;
+ }
+
+ ret = crypto_blkcipher_setkey(c->chacha20,
+ (void *) &key.key, sizeof(key.key));
+ if (ret)
+ goto err;
+
+ crypt = container_of_or_null(bch_fs_sb_field_resize(c, NULL,
+ sizeof(*crypt) / sizeof(u64)),
+ struct bch_sb_field_crypt, field);
+ if (!crypt) {
+ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
+ goto err;
+ }
+
+ crypt->field.type = BCH_SB_FIELD_crypt;
+ crypt->key = key;
+
+ /* write superblock */
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1);
+ bch_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ memzero_explicit(&user_key, sizeof(user_key));
+ memzero_explicit(&key, sizeof(key));
+ return ret;
+}
+
+void bch_cache_set_encryption_free(struct cache_set *c)
+{
+ if (!IS_ERR_OR_NULL(c->poly1305))
+ crypto_free_shash(c->poly1305);
+ if (!IS_ERR_OR_NULL(c->chacha20))
+ crypto_free_blkcipher(c->chacha20);
+}
+
+int bch_cache_set_encryption_init(struct cache_set *c)
+{
+ struct bch_sb_field_crypt *crypt;
+ struct bch_key key;
+ int ret;
+
+ crypt = bch_sb_get_crypt(c->disk_sb);
+ if (!crypt)
+ return 0;
+
+ ret = bch_alloc_ciphers(c);
+ if (ret)
+ return ret;
+
+ ret = bch_decrypt_sb_key(c, crypt, &key);
+ if (ret)
+ goto err;
+
+ ret = crypto_blkcipher_setkey(c->chacha20,
+ (void *) &key.key, sizeof(key.key));
+err:
+ memzero_explicit(&key, sizeof(key));
+ return ret;
}
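For readers following the checksum refactor, here is a minimal userspace sketch of the init/update/final split bch_checksum() now uses for the plain CRC types, and of how a 64-bit result is wrapped into the wider csum container. The toy_* names and the mixing step are illustrative stand-ins, not the real CRC primitives:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct toy_csum { uint64_t lo, hi; };		/* mirrors the 128-bit csum container */

static uint64_t toy_init(void)  { return UINT64_MAX; }		/* seed, as in the CRC64 case */
static uint64_t toy_final(uint64_t c) { return c ^ UINT64_MAX; }	/* final inversion */

static uint64_t toy_update(uint64_t c, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--)			/* stand-in mixing step, not a real CRC */
		c = (c << 7) ^ (c >> 3) ^ *p++;
	return c;
}

/* one-shot helper, same shape as the CRC branch of bch_checksum() */
static struct toy_csum toy_checksum(const void *data, size_t len)
{
	uint64_t c = toy_init();

	c = toy_update(c, data, len);
	c = toy_final(c);
	return (struct toy_csum) { .lo = c };	/* hi is unused for plain CRCs */
}

int main(void)
{
	const char buf[] = "hello";

	printf("%016llx\n", (unsigned long long) toy_checksum(buf, sizeof(buf) - 1).lo);
	return 0;
}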
diff --git a/libbcache/checksum.h b/libbcache/checksum.h
index 196b7e8..a9a1758 100644
--- a/libbcache/checksum.h
+++ b/libbcache/checksum.h
@@ -1,24 +1,133 @@
#ifndef _BCACHE_CHECKSUM_H
#define _BCACHE_CHECKSUM_H
-#include "btree_types.h"
+#include "bcache.h"
+#include "super-io.h"
+
+#include <crypto/chacha20.h>
u64 bch_crc64_update(u64, const void *, size_t);
-u64 bch_checksum_update(unsigned, u64, const void *, size_t);
-u64 bch_checksum(unsigned, const void *, size_t);
-u32 bch_checksum_bio(struct bio *, unsigned);
+#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
+#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
+#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
+#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
+#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
+
+struct bch_csum bch_checksum(struct cache_set *, unsigned, struct nonce,
+ const void *, size_t);
/*
- * This is used for various on disk data structures - cache_sb, prio_set, bset,
- * jset: The checksum is _always_ the first 8 bytes of these structs
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
*/
-#define __csum_set(i, u64s, type) \
+#define csum_vstruct(_c, _type, _nonce, _i) \
({ \
- const void *start = ((const void *) (i)) + sizeof(u64); \
- const void *end = __bkey_idx(i, u64s); \
+ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
+ const void *end = vstruct_end(_i); \
\
- bch_checksum(type, start, end - start); \
+ bch_checksum(_c, _type, _nonce, start, end - start); \
})
+int bch_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+int bch_request_key(struct bch_sb *, struct bch_key *);
+
+void bch_encrypt(struct cache_set *, unsigned, struct nonce,
+ void *data, size_t);
+
+struct bch_csum bch_checksum_bio(struct cache_set *, unsigned,
+ struct nonce, struct bio *);
+void bch_encrypt_bio(struct cache_set *, unsigned,
+ struct nonce, struct bio *);
+
+int bch_disable_encryption(struct cache_set *);
+int bch_enable_encryption(struct cache_set *, bool);
+
+void bch_cache_set_encryption_free(struct cache_set *);
+int bch_cache_set_encryption_init(struct cache_set *);
+
+static inline unsigned bch_data_checksum_type(struct cache_set *c)
+{
+ if (c->sb.encryption_type)
+ return c->opts.wide_macs
+ ? BCH_CSUM_CHACHA20_POLY1305_128
+ : BCH_CSUM_CHACHA20_POLY1305_80;
+
+ return c->opts.data_checksum;
+}
+
+static inline unsigned bch_meta_checksum_type(struct cache_set *c)
+{
+ return c->sb.encryption_type
+ ? BCH_CSUM_CHACHA20_POLY1305_128
+ : c->opts.metadata_checksum;
+}
+
+static inline bool bch_checksum_type_valid(const struct cache_set *c,
+ unsigned type)
+{
+ if (type >= BCH_CSUM_NR)
+ return false;
+
+ if (bch_csum_type_is_encryption(type) && !c->chacha20)
+ return false;
+
+ return true;
+}
+
+static const unsigned bch_crc_bytes[] = {
+ [BCH_CSUM_NONE] = 0,
+ [BCH_CSUM_CRC32C] = 4,
+ [BCH_CSUM_CRC64] = 8,
+ [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
+ [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
+};
+
+static inline bool bch_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+ /*
+ * XXX: need some way of preventing the compiler from optimizing this
+ * into a form that isn't constant time..
+ */
+ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
+}
+
+/* for skipping ahead and encrypting/decrypting at an offset: */
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+ EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+
+ le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ return nonce;
+}
+
+static inline bool bch_key_is_encrypted(struct bch_encrypted_key *key)
+{
+ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
+}
+
+static inline struct nonce __bch_sb_key_nonce(struct bch_sb *sb)
+{
+ __le64 magic = __bch_sb_magic(sb);
+
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = 0,
+ [2] = ((__le32 *) &magic)[0],
+ [3] = ((__le32 *) &magic)[1],
+ }};
+}
+
+static inline struct nonce bch_sb_key_nonce(struct cache_set *c)
+{
+ __le64 magic = bch_sb_magic(c);
+
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = 0,
+ [2] = ((__le32 *) &magic)[0],
+ [3] = ((__le32 *) &magic)[1],
+ }};
+}
+
#endif /* _BCACHE_CHECKSUM_H */
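A hedged sketch of the nonce arithmetic behind nonce_add() and the per-batch advance in bch_encrypt_bio(): the low nonce word acts as a block counter, so skipping a byte offset just bumps it by offset divided by ChaCha20's 64-byte block size. This simplification assumes a little-endian host (the real code uses le32_add_cpu), and the names are illustrative:

#include <stdint.h>
#include <stdio.h>

#define TOY_CHACHA_BLOCK 64u	/* ChaCha20 block size in bytes */

struct toy_nonce { uint32_t d[4]; };	/* mirrors the 16-byte nonce/IV */

/* advance the block counter so en/decryption can start at a byte offset */
static struct toy_nonce toy_nonce_add(struct toy_nonce n, unsigned offset)
{
	/* offset must be block aligned, as the EBUG_ON above asserts */
	n.d[0] += offset / TOY_CHACHA_BLOCK;
	return n;
}

int main(void)
{
	struct toy_nonce n = {{ 0, 0, 0x12345678, 0x9abcdef0 }};

	n = toy_nonce_add(n, 3 * TOY_CHACHA_BLOCK);	/* skip the first three blocks */
	printf("counter word is now %u\n", (unsigned) n.d[0]);	/* prints 3 */
	return 0;
}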
diff --git a/libbcache/compress.c b/libbcache/compress.c
index f7bfd57..e76850b 100644
--- a/libbcache/compress.c
+++ b/libbcache/compress.c
@@ -1,6 +1,8 @@
#include "bcache.h"
#include "compress.h"
+#include "extents.h"
#include "io.h"
+#include "super-io.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
@@ -50,7 +52,7 @@ static void *__bio_map_or_bounce(struct cache_set *c,
unsigned prev_end = PAGE_SIZE;
void *data;
- BUG_ON(bvec_iter_sectors(start) > BCH_COMPRESSED_EXTENT_MAX);
+ BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX);
*bounced = BOUNCED_MAPPED;
@@ -118,12 +120,12 @@ static void bio_unmap_or_unbounce(struct cache_set *c, void *data,
}
static int __bio_uncompress(struct cache_set *c, struct bio *src,
- void *dst_data, struct bch_extent_crc64 crc)
+ void *dst_data, struct bch_extent_crc128 crc)
{
void *src_data = NULL;
unsigned src_bounced;
size_t src_len = src->bi_iter.bi_size;
- size_t dst_len = crc.uncompressed_size << 9;
+ size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret;
src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
@@ -179,10 +181,10 @@ err:
int bch_bio_uncompress_inplace(struct cache_set *c, struct bio *bio,
unsigned live_data_sectors,
- struct bch_extent_crc64 crc)
+ struct bch_extent_crc128 crc)
{
void *dst_data = NULL;
- size_t dst_len = crc.uncompressed_size << 9;
+ size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret = -ENOMEM;
BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
@@ -231,11 +233,11 @@ use_mempool:
int bch_bio_uncompress(struct cache_set *c, struct bio *src,
struct bio *dst, struct bvec_iter dst_iter,
- struct bch_extent_crc64 crc)
+ struct bch_extent_crc128 crc)
{
void *dst_data = NULL;
unsigned dst_bounced;
- size_t dst_len = crc.uncompressed_size << 9;
+ size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
int ret = -ENOMEM;
dst_data = dst_len == dst_iter.bi_size
@@ -273,28 +275,23 @@ static int __bio_compress(struct cache_set *c,
*src_len = src->bi_iter.bi_size;
workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-retry_compress:
- ret = lz4_compress(src_data, *src_len,
- dst_data, dst_len,
- workspace);
- /*
- * On error, the compressed data was bigger than dst_len, and
- * -ret is the amount of data we were able to compress - round
- * down to nearest block and try again:
- */
- if (ret && round_down(-ret, block_bytes(c)) > *dst_len) {
- BUG_ON(ret > 0);
- /* not supposed to happen */
- if (WARN_ON(-ret >= *src_len))
- goto err;
+ while (*src_len > block_bytes(c) &&
+ (ret = lz4_compress(src_data, *src_len,
+ dst_data, dst_len,
+ workspace))) {
+ /*
+ * On error, the compressed data was bigger than
+ * dst_len, and -ret is the amount of data we were able
+ * to compress - round down to nearest block and try
+ * again:
+ */
+ BUG_ON(ret > 0);
+ BUG_ON(-ret >= *src_len);
*src_len = round_down(-ret, block_bytes(c));
- if (!*src_len)
- goto err;
-
- goto retry_compress;
}
+
mempool_free(workspace, &c->lz4_workspace_pool);
if (ret)
@@ -354,6 +351,10 @@ zlib_err:
}
BUG_ON(!*dst_len);
+ BUG_ON(*dst_len > dst->bi_iter.bi_size);
+
+ BUG_ON(*src_len & (block_bytes(c) - 1));
+ BUG_ON(*src_len > src->bi_iter.bi_size);
/* Didn't get smaller: */
if (round_up(*dst_len, block_bytes(c)) >= *src_len) {
@@ -382,9 +383,9 @@ void bch_bio_compress(struct cache_set *c,
unsigned orig_dst = dst->bi_iter.bi_size;
unsigned orig_src = src->bi_iter.bi_size;
- /* Don't consume more than BCH_COMPRESSED_EXTENT_MAX from @src: */
+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size =
- min(src->bi_iter.bi_size, BCH_COMPRESSED_EXTENT_MAX << 9);
+ min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9);
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size =
@@ -405,6 +406,30 @@ out:
src->bi_iter.bi_size = orig_src;
}
+/* doesn't write superblock: */
+int bch_check_set_has_compressed_data(struct cache_set *c,
+ unsigned compression_type)
+{
+ switch (compression_type) {
+ case BCH_COMPRESSION_NONE:
+ return 0;
+ case BCH_COMPRESSION_LZ4:
+ if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
+ return 0;
+
+ bch_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
+ break;
+ case BCH_COMPRESSION_GZIP:
+ if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+ return 0;
+
+ bch_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
+ break;
+ }
+
+ return bch_compress_init(c);
+}
+
void bch_compress_free(struct cache_set *c)
{
vfree(c->zlib_workspace);
@@ -420,39 +445,56 @@ void bch_compress_free(struct cache_set *c)
int bch_compress_init(struct cache_set *c)
{
+ unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
int ret, cpu;
- c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
- if (!c->bio_decompress_worker)
- return -ENOMEM;
+ if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
+ !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+ return 0;
- for_each_possible_cpu(cpu) {
- struct bio_decompress_worker *d =
- per_cpu_ptr(c->bio_decompress_worker, cpu);
+ if (!c->bio_decompress_worker) {
+ c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
+ if (!c->bio_decompress_worker)
+ return -ENOMEM;
- d->c = c;
- INIT_WORK(&d->work, bch_bio_decompress_work);
- init_llist_head(&d->bio_list);
+ for_each_possible_cpu(cpu) {
+ struct bio_decompress_worker *d =
+ per_cpu_ptr(c->bio_decompress_worker, cpu);
+
+ d->c = c;
+ INIT_WORK(&d->work, bch_bio_decompress_work);
+ init_llist_head(&d->bio_list);
+ }
}
- ret = mempool_init_page_pool(&c->compression_bounce[READ], 1,
- get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
- if (ret)
- return ret;
+ if (!mempool_initialized(&c->compression_bounce[READ])) {
+ ret = mempool_init_page_pool(&c->compression_bounce[READ],
+ 1, order);
+ if (ret)
+ return ret;
+ }
- ret = mempool_init_page_pool(&c->compression_bounce[WRITE], 1,
- get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
- if (ret)
- return ret;
+ if (!mempool_initialized(&c->compression_bounce[WRITE])) {
+ ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
+ 1, order);
+ if (ret)
+ return ret;
+ }
- ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, 1,
- LZ4_MEM_COMPRESS);
- if (ret)
- return ret;
+ if (!mempool_initialized(&c->lz4_workspace_pool) &&
+ bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
+ ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
+ 1, LZ4_MEM_COMPRESS);
+ if (ret)
+ return ret;
+ }
- c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
- if (!c->zlib_workspace)
- return -ENOMEM;
+ if (!c->zlib_workspace &&
+ bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
+ c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
+ if (!c->zlib_workspace)
+ return -ENOMEM;
+ }
return 0;
}
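To make the new shrink-and-retry loop in __bio_compress() easier to follow, this standalone sketch mirrors its control flow with a stand-in compressor whose failure convention (a negative return giving the bytes that did fit) matches how lz4_compress() is used above; the block size and names are made up:

#include <stddef.h>
#include <stdio.h>

#define TOY_BLOCK 512u

static size_t round_down_block(size_t n) { return n & ~(size_t)(TOY_BLOCK - 1); }

/*
 * pretend compressor: "fails" if asked to fit more than half the input,
 * returning -(bytes that would have fit), mirroring the convention above
 */
static int toy_compress(size_t src_len, size_t dst_len)
{
	return src_len / 2 <= dst_len ? 0 : -(int)(dst_len * 2);
}

int main(void)
{
	size_t src_len = 8 * TOY_BLOCK, dst_len = 2 * TOY_BLOCK;
	int ret;

	/* keep shrinking the input to what fit, rounded down to a block */
	while (src_len > TOY_BLOCK &&
	       (ret = toy_compress(src_len, dst_len)))
		src_len = round_down_block((size_t) -ret);

	printf("compressed %zu of the original bytes\n", src_len);
	return 0;
}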
diff --git a/libbcache/compress.h b/libbcache/compress.h
index 02578ef..485acd9 100644
--- a/libbcache/compress.h
+++ b/libbcache/compress.h
@@ -2,12 +2,13 @@
#define _BCACHE_COMPRESS_H
int bch_bio_uncompress_inplace(struct cache_set *, struct bio *,
- unsigned, struct bch_extent_crc64);
+ unsigned, struct bch_extent_crc128);
int bch_bio_uncompress(struct cache_set *, struct bio *, struct bio *,
- struct bvec_iter, struct bch_extent_crc64);
+ struct bvec_iter, struct bch_extent_crc128);
void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
struct bio *, size_t *, unsigned *);
+int bch_check_set_has_compressed_data(struct cache_set *, unsigned);
void bch_compress_free(struct cache_set *);
int bch_compress_init(struct cache_set *);
diff --git a/libbcache/debug.c b/libbcache/debug.c
index 39f5550..d25c32a 100644
--- a/libbcache/debug.c
+++ b/libbcache/debug.c
@@ -96,7 +96,7 @@ void __bch_btree_verify(struct cache_set *c, struct btree *b)
if (inmemory->u64s != sorted->u64s ||
memcmp(inmemory->start,
sorted->start,
- (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+ vstruct_end(inmemory) - (void *) inmemory->start)) {
unsigned offset = 0, sectors;
struct bset *i;
unsigned j;
@@ -112,18 +112,14 @@ void __bch_btree_verify(struct cache_set *c, struct btree *b)
while (offset < b->written) {
if (!offset ) {
i = &n_ondisk->keys;
- sectors = __set_blocks(n_ondisk,
- le16_to_cpu(n_ondisk->keys.u64s),
- block_bytes(c)) <<
+ sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
c->block_bits;
} else {
struct btree_node_entry *bne =
(void *) n_ondisk + (offset << 9);
i = &bne->keys;
- sectors = __set_blocks(bne,
- le16_to_cpu(bne->keys.u64s),
- block_bytes(c)) <<
+ sectors = vstruct_blocks(bne, c->block_bits) <<
c->block_bits;
}
@@ -427,7 +423,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
if (IS_ERR_OR_NULL(bch_debug))
return;
- snprintf(name, sizeof(name), "%pU", c->disk_sb.user_uuid.b);
+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
c->debug = debugfs_create_dir(name, bch_debug);
if (IS_ERR_OR_NULL(c->debug))
return;
diff --git a/libbcache/dirent.c b/libbcache/dirent.c
index d97c3b2..ebf0f10 100644
--- a/libbcache/dirent.c
+++ b/libbcache/dirent.c
@@ -23,34 +23,13 @@ unsigned bch_dirent_name_bytes(struct bkey_s_c_dirent d)
static u64 bch_dirent_hash(const struct bch_hash_info *info,
const struct qstr *name)
{
- switch (info->type) {
- case BCH_STR_HASH_SHA1: {
- SHASH_DESC_ON_STACK(desc, bch_sha1);
- u8 digest[SHA1_DIGEST_SIZE];
- u64 ret;
- desc->tfm = bch_sha1;
- desc->flags = 0;
- crypto_shash_init(desc);
-
- crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
-
- crypto_shash_update(desc, (void *) name->name, name->len);
- crypto_shash_final(desc, digest);
- memcpy(&ret, &digest, sizeof(ret));
- return max_t(u64, ret >> 1, 2);
- }
- default: {
- struct bch_str_hash_ctx ctx;
-
- bch_str_hash_init(&ctx, info->type);
- bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
+ struct bch_str_hash_ctx ctx;
- bch_str_hash_update(&ctx, info->type, name->name, name->len);
+ bch_str_hash_init(&ctx, info);
+ bch_str_hash_update(&ctx, info, name->name, name->len);
- /* [0,2) reserved for dots */
- return max_t(u64, bch_str_hash_end(&ctx, info->type), 2);
- }
- }
+ /* [0,2) reserved for dots */
+ return max_t(u64, bch_str_hash_end(&ctx, info), 2);
}
static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
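The simplified dirent hash above still clamps its result to at least 2, since hash values 0 and 1 are reserved for the "." and ".." entries. A tiny sketch of that clamp with an illustrative stand-in hash (the real code uses the configurable bch_str_hash_* helpers):

#include <stdint.h>
#include <stdio.h>

#define TOY_MAX(a, b) ((a) > (b) ? (a) : (b))

/* stand-in string hash (FNV-1a), purely illustrative */
static uint64_t toy_str_hash(const char *name)
{
	uint64_t h = 1469598103934665603ull;

	while (*name)
		h = (h ^ (uint8_t) *name++) * 1099511628211ull;
	return h;
}

static uint64_t toy_dirent_hash(const char *name)
{
	/* [0,2) reserved for dots, as in the kernel code above */
	return TOY_MAX(toy_str_hash(name), (uint64_t) 2);
}

int main(void)
{
	printf("%llu\n", (unsigned long long) toy_dirent_hash("hello"));
	return 0;
}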
diff --git a/libbcache/extents.c b/libbcache/extents.c
index c026d59..4b8a266 100644
--- a/libbcache/extents.c
+++ b/libbcache/extents.c
@@ -9,19 +9,19 @@
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "checksum.h"
#include "debug.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
-#include "super.h"
+#include "super-io.h"
#include "writeback.h"
#include "xattr.h"
#include <trace/events/bcache.h>
-static bool __bch_extent_normalize(struct cache_set *, struct bkey_s, bool);
static enum merge_result bch_extent_merge(struct cache_set *, struct btree *,
struct bkey_i *, struct bkey_i *);
@@ -120,21 +120,38 @@ bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
return NULL;
}
-unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent e,
- const struct bch_extent_ptr *start)
+unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
{
const struct bch_extent_ptr *ptr;
unsigned nr_ptrs = 0;
- extent_for_each_ptr_from(e, ptr, start)
+ extent_for_each_ptr(e, ptr)
nr_ptrs++;
return nr_ptrs;
}
-unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
+unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c k)
{
- return bch_extent_nr_ptrs_from(e, &e.v->start->ptr);
+ struct bkey_s_c_extent e;
+ const struct bch_extent_ptr *ptr;
+ unsigned nr_ptrs = 0;
+
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ e = bkey_s_c_to_extent(k);
+
+ extent_for_each_ptr(e, ptr)
+ nr_ptrs += !ptr->cached;
+ break;
+
+ case BCH_RESERVATION:
+ nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
+ break;
+ }
+
+ return nr_ptrs;
}
/* returns true if equal */
@@ -177,16 +194,19 @@ void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc
*
* and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
* use crc_live here (that we verified was correct earlier)
+ *
+ * note: doesn't work with encryption
*/
void bch_extent_narrow_crcs(struct bkey_s_extent e)
{
union bch_extent_crc *crc;
bool have_wide = false, have_narrow = false;
- u64 csum = 0;
+ struct bch_csum csum = { 0 };
unsigned csum_type = 0;
extent_for_each_crc(e, crc) {
- if (crc_compression_type(crc))
+ if (crc_compression_type(crc) ||
+ bch_csum_type_is_encryption(crc_csum_type(crc)))
continue;
if (crc_uncompressed_size(e.k, crc) != e.k->size) {
@@ -210,26 +230,38 @@ void bch_extent_narrow_crcs(struct bkey_s_extent e)
case BCH_EXTENT_CRC_NONE:
BUG();
case BCH_EXTENT_CRC32:
- if (bch_crc_size[csum_type] > sizeof(crc->crc32.csum))
+ if (bch_crc_bytes[csum_type] > 4)
continue;
bch_extent_crc_narrow_pointers(e, crc);
- crc->crc32.compressed_size = e.k->size;
- crc->crc32.uncompressed_size = e.k->size;
+ crc->crc32._compressed_size = e.k->size - 1;
+ crc->crc32._uncompressed_size = e.k->size - 1;
crc->crc32.offset = 0;
crc->crc32.csum_type = csum_type;
- crc->crc32.csum = csum;
+ crc->crc32.csum = csum.lo;
break;
case BCH_EXTENT_CRC64:
- if (bch_crc_size[csum_type] > sizeof(crc->crc64.csum))
+ if (bch_crc_bytes[csum_type] > 10)
continue;
bch_extent_crc_narrow_pointers(e, crc);
- crc->crc64.compressed_size = e.k->size;
- crc->crc64.uncompressed_size = e.k->size;
+ crc->crc64._compressed_size = e.k->size - 1;
+ crc->crc64._uncompressed_size = e.k->size - 1;
crc->crc64.offset = 0;
crc->crc64.csum_type = csum_type;
- crc->crc64.csum = csum;
+ crc->crc64.csum_lo = csum.lo;
+ crc->crc64.csum_hi = csum.hi;
+ break;
+ case BCH_EXTENT_CRC128:
+ if (bch_crc_bytes[csum_type] > 16)
+ continue;
+
+ bch_extent_crc_narrow_pointers(e, crc);
+ crc->crc128._compressed_size = e.k->size - 1;
+ crc->crc128._uncompressed_size = e.k->size - 1;
+ crc->crc128.offset = 0;
+ crc->crc128.csum_type = csum_type;
+ crc->crc128.csum = csum;
break;
}
}
@@ -300,13 +332,8 @@ static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
struct bch_extent_ptr *ptr = &e.v->start->ptr;
bool dropped = false;
- /*
- * We don't want to change which pointers are considered cached/dirty,
- * so don't remove pointers that are considered dirty:
- */
rcu_read_lock();
- while ((ptr = extent_ptr_next(e, ptr)) &&
- !bch_extent_ptr_is_dirty(c, e.c, ptr))
+ while ((ptr = extent_ptr_next(e, ptr)))
if (should_drop_ptr(c, e.c, ptr)) {
__bch_extent_drop_ptr(e, ptr);
dropped = true;
@@ -321,16 +348,43 @@ static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
static bool bch_ptr_normalize(struct cache_set *c, struct btree *bk,
struct bkey_s k)
{
- return __bch_extent_normalize(c, k, false);
+ return bch_extent_normalize(c, k);
}
static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
{
- u64 *d = (u64 *) bkeyp_val(f, k);
- unsigned i;
+ switch (k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED: {
+ union bch_extent_entry *entry;
+ u64 *d = (u64 *) bkeyp_val(f, k);
+ unsigned i;
- for (i = 0; i < bkeyp_val_u64s(f, k); i++)
- d[i] = swab64(d[i]);
+ for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+ d[i] = swab64(d[i]);
+
+ for (entry = (union bch_extent_entry *) d;
+ entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+ entry = extent_entry_next(entry)) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.csum = swab32(entry->crc32.csum);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+				entry->crc128.csum.hi = swab64(entry->crc128.csum.hi);
+				entry->crc128.csum.lo = swab64(entry->crc128.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_ptr:
+ break;
+ }
+ }
+ break;
+ }
+ }
}
static const char *extent_ptr_invalid(struct bkey_s_c_extent e,
@@ -341,7 +395,7 @@ static const char *extent_ptr_invalid(struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr2;
const struct cache_member_cpu *m = mi->m + ptr->dev;
- if (ptr->dev > mi->nr_in_set || !m->valid)
+ if (ptr->dev > mi->nr_devices || !m->valid)
return "pointer to invalid device";
extent_for_each_ptr(e, ptr2)
@@ -380,7 +434,9 @@ static size_t extent_print_ptrs(struct cache_set *c, char *buf,
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
crc = entry_to_crc(entry);
+
p("crc: c_size %u size %u offset %u csum %u compress %u",
crc_compressed_size(e.k, crc),
crc_uncompressed_size(e.k, crc),
@@ -388,7 +444,8 @@ static size_t extent_print_ptrs(struct cache_set *c, char *buf,
crc_compression_type(crc));
break;
case BCH_EXTENT_ENTRY_ptr:
- ptr = &entry->ptr;
+ ptr = entry_to_ptr(entry);
+
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
(ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr)
@@ -621,6 +678,10 @@ static bool __bch_cut_front(struct bpos where, struct bkey_s k)
if (prev_crc != crc)
crc->crc64.offset += e.k->size - len;
break;
+ case BCH_EXTENT_CRC128:
+ if (prev_crc != crc)
+ crc->crc128.offset += e.k->size - len;
+ break;
}
prev_crc = crc;
}
@@ -948,7 +1009,7 @@ static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
BUG_ON(!l.k->size || !r.k->size);
if (l.k->type != r.k->type ||
- l.k->version != r.k->version)
+ bversion_cmp(l.k->version, r.k->version))
return false;
switch (l.k->type) {
@@ -985,7 +1046,7 @@ static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
extent_for_each_ptr(le, lp) {
const union bch_extent_entry *entry =
- bkey_idx(re.v, (u64 *) lp - le.v->_data);
+ vstruct_idx(re.v, (u64 *) lp - le.v->_data);
if (!extent_entry_is_ptr(entry))
return false;
@@ -1142,7 +1203,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
bkey_cmp(s->committed, insert->k.p) &&
- bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
+ bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
/* XXX: possibly need to increase our reservation? */
bch_cut_subtract_back(s, s->committed,
bkey_i_to_s(&split.k));
@@ -1178,12 +1239,19 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
{
struct extent_insert_hook *hook = s->trans->hook;
enum extent_insert_hook_ret ret;
-
+#if 0
+ /*
+ * Currently disabled for encryption - broken with fcollapse. Will have
+ * to reenable when versions are exposed for send/receive - versions
+ * will have to be monotonic then:
+ */
if (k.k && k.k->size &&
- s->insert->k->k.version &&
- k.k->version > s->insert->k->k.version)
+ !bversion_zero(s->insert->k->k.version) &&
+ bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
ret = BTREE_HOOK_NO_INSERT;
- else if (hook)
+ } else
+#endif
+ if (hook)
ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
else
ret = BTREE_HOOK_DO_INSERT;
@@ -1257,7 +1325,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
unsigned sectors;
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bkey_extent_is_compressed(c, k))) {
+ (sectors = bkey_extent_is_compressed(k))) {
int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
if (s->trans->flags & BTREE_INSERT_NOFAIL)
@@ -1680,6 +1748,7 @@ static const char *bch_extent_invalid(const struct cache_set *c,
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
const union bch_extent_crc *crc;
+ const struct bch_extent_ptr *ptr;
struct cache_member_rcu *mi = cache_member_info_get(c);
unsigned size_ondisk = e.k->size;
const char *reason;
@@ -1689,9 +1758,7 @@ static const char *bch_extent_invalid(const struct cache_set *c,
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
goto invalid;
- switch (extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_crc32:
- case BCH_EXTENT_ENTRY_crc64:
+ if (extent_entry_is_crc(entry)) {
crc = entry_to_crc(entry);
reason = "checksum offset + key size > uncompressed size";
@@ -1702,19 +1769,19 @@ static const char *bch_extent_invalid(const struct cache_set *c,
size_ondisk = crc_compressed_size(e.k, crc);
reason = "invalid checksum type";
- if (crc_csum_type(crc) >= BCH_CSUM_NR)
+ if (!bch_checksum_type_valid(c, crc_csum_type(crc)))
goto invalid;
reason = "invalid compression type";
if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
goto invalid;
- break;
- case BCH_EXTENT_ENTRY_ptr:
+ } else {
+ ptr = entry_to_ptr(entry);
+
reason = extent_ptr_invalid(e, mi,
&entry->ptr, size_ondisk);
if (reason)
goto invalid;
- break;
}
}
@@ -1725,8 +1792,17 @@ invalid:
return reason;
}
- case BCH_RESERVATION:
+ case BCH_RESERVATION: {
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+ return "incorrect value size";
+
+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
+ return "invalid nr_replicas";
+
return NULL;
+ }
default:
return "invalid value type";
@@ -1743,7 +1819,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
unsigned seq, stale;
char buf[160];
bool bad;
- unsigned ptrs_per_tier[CACHE_TIERS];
+ unsigned ptrs_per_tier[BCH_TIER_MAX];
unsigned tier, replicas = 0;
/*
@@ -1760,11 +1836,9 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
mi = cache_member_info_get(c);
extent_for_each_ptr(e, ptr) {
- bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
-
replicas++;
- if (ptr->dev >= mi->nr_in_set)
+ if (ptr->dev >= mi->nr_devices)
goto bad_device;
/*
@@ -1796,7 +1870,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
stale = ptr_stale(ca, ptr);
- cache_set_bug_on(stale && dirty, c,
+ cache_set_bug_on(stale && !ptr->cached, c,
"stale dirty pointer");
cache_set_bug_on(stale > 96, c,
@@ -1809,9 +1883,9 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
bad = (mark.is_metadata ||
(gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
!mark.owned_by_allocator &&
- !(dirty
- ? mark.dirty_sectors
- : mark.cached_sectors)));
+ !(ptr->cached
+ ? mark.cached_sectors
+ : mark.dirty_sectors)));
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
if (bad)
@@ -1869,6 +1943,7 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
+ break;
case BCH_RESERVATION:
break;
default:
@@ -1896,69 +1971,77 @@ static void bch_extent_to_text(struct cache_set *c, char *buf,
static unsigned PTR_TIER(struct cache_member_rcu *mi,
const struct bch_extent_ptr *ptr)
{
- return ptr->dev < mi->nr_in_set
+ return ptr->dev < mi->nr_devices
? mi->m[ptr->dev].tier
: UINT_MAX;
}
-void bch_extent_entry_append(struct bkey_i_extent *e,
- union bch_extent_entry *entry)
-{
- BUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
- BKEY_EXTENT_VAL_U64s_MAX);
-
- memcpy_u64s(extent_entry_last(extent_i_to_s(e)),
- entry,
- extent_entry_u64s(entry));
- e->k.u64s += extent_entry_u64s(entry);
-}
-
-const unsigned bch_crc_size[] = {
- [BCH_CSUM_NONE] = 0,
- [BCH_CSUM_CRC32C] = 4,
- [BCH_CSUM_CRC64] = 8,
-};
-
static void bch_extent_crc_init(union bch_extent_crc *crc,
unsigned compressed_size,
unsigned uncompressed_size,
unsigned compression_type,
- u64 csum, unsigned csum_type)
+ unsigned nonce,
+ struct bch_csum csum, unsigned csum_type)
{
- if (bch_crc_size[csum_type] <= 4 &&
- uncompressed_size <= CRC32_EXTENT_SIZE_MAX) {
+ if (bch_crc_bytes[csum_type] <= 4 &&
+ uncompressed_size <= CRC32_SIZE_MAX &&
+ nonce <= CRC32_NONCE_MAX) {
crc->crc32 = (struct bch_extent_crc32) {
.type = 1 << BCH_EXTENT_ENTRY_crc32,
- .compressed_size = compressed_size,
- .uncompressed_size = uncompressed_size,
+ ._compressed_size = compressed_size - 1,
+ ._uncompressed_size = uncompressed_size - 1,
.offset = 0,
.compression_type = compression_type,
.csum_type = csum_type,
- .csum = csum,
+ .csum = *((__le32 *) &csum.lo),
};
- } else {
- BUG_ON(uncompressed_size > CRC64_EXTENT_SIZE_MAX);
+ return;
+ }
+ if (bch_crc_bytes[csum_type] <= 10 &&
+ uncompressed_size <= CRC64_SIZE_MAX &&
+ nonce <= CRC64_NONCE_MAX) {
crc->crc64 = (struct bch_extent_crc64) {
.type = 1 << BCH_EXTENT_ENTRY_crc64,
- .compressed_size = compressed_size,
- .uncompressed_size = uncompressed_size,
+ ._compressed_size = compressed_size - 1,
+ ._uncompressed_size = uncompressed_size - 1,
+ .offset = 0,
+ .nonce = nonce,
+ .compression_type = compression_type,
+ .csum_type = csum_type,
+ .csum_lo = csum.lo,
+ .csum_hi = *((__le16 *) &csum.hi),
+ };
+ return;
+ }
+
+ if (bch_crc_bytes[csum_type] <= 16 &&
+ uncompressed_size <= CRC128_SIZE_MAX &&
+ nonce <= CRC128_NONCE_MAX) {
+ crc->crc128 = (struct bch_extent_crc128) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc128,
+ ._compressed_size = compressed_size - 1,
+ ._uncompressed_size = uncompressed_size - 1,
.offset = 0,
+ .nonce = nonce,
.compression_type = compression_type,
.csum_type = csum_type,
.csum = csum,
};
+ return;
}
+
+ BUG();
}
void bch_extent_crc_append(struct bkey_i_extent *e,
unsigned compressed_size,
unsigned uncompressed_size,
unsigned compression_type,
- u64 csum, unsigned csum_type)
+ unsigned nonce,
+ struct bch_csum csum, unsigned csum_type)
{
union bch_extent_crc *crc;
- union bch_extent_crc new;
BUG_ON(compressed_size > uncompressed_size);
BUG_ON(uncompressed_size != e->k.size);
@@ -1971,123 +2054,26 @@ void bch_extent_crc_append(struct bkey_i_extent *e,
extent_for_each_crc(extent_i_to_s(e), crc)
;
- switch (extent_crc_type(crc)) {
- case BCH_EXTENT_CRC_NONE:
- if (!csum_type && !compression_type)
- return;
- break;
- case BCH_EXTENT_CRC32:
- case BCH_EXTENT_CRC64:
- if (crc_compressed_size(&e->k, crc) == compressed_size &&
- crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
- crc_offset(crc) == 0 &&
- crc_compression_type(crc) == compression_type &&
- crc_csum_type(crc) == csum_type &&
- crc_csum(crc) == csum)
- return;
- break;
- }
+ if (!crc && !csum_type && !compression_type)
+ return;
+
+ if (crc &&
+ crc_compressed_size(&e->k, crc) == compressed_size &&
+ crc_uncompressed_size(&e->k, crc) == uncompressed_size &&
+ crc_offset(crc) == 0 &&
+ crc_nonce(crc) == nonce &&
+ crc_csum_type(crc) == csum_type &&
+ crc_compression_type(crc) == compression_type &&
+ crc_csum(crc).lo == csum.lo &&
+ crc_csum(crc).hi == csum.hi)
+ return;
- bch_extent_crc_init(&new,
+ bch_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
compressed_size,
uncompressed_size,
compression_type,
- csum, csum_type);
- bch_extent_entry_append(e, to_entry(&new));
-}
-
-static void __extent_sort_ptrs(struct cache_member_rcu *mi,
- struct bkey_s_extent src)
-{
- struct bch_extent_ptr *src_ptr, *dst_ptr;
- union bch_extent_crc *src_crc, *dst_crc;
- union bch_extent_crc _src;
- BKEY_PADDED(k) tmp;
- struct bkey_s_extent dst;
- size_t u64s, crc_u64s;
- u64 *p;
-
- /*
- * Insertion sort:
- *
- * Note: this sort needs to be stable, because pointer order determines
- * pointer dirtyness.
- */
-
- tmp.k.k = *src.k;
- dst = bkey_i_to_s_extent(&tmp.k);
- set_bkey_val_u64s(dst.k, 0);
-
- extent_for_each_ptr_crc(src, src_ptr, src_crc) {
- extent_for_each_ptr_crc(dst, dst_ptr, dst_crc)
- if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr))
- goto found;
-
- dst_ptr = &extent_entry_last(dst)->ptr;
- dst_crc = NULL;
-found:
- /* found insert position: */
-
- /*
- * we're making sure everything has a crc at this point, if
- * dst_ptr points to a pointer it better have a crc:
- */
- BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc);
- BUG_ON(dst_crc &&
- (extent_entry_next(to_entry(dst_crc)) !=
- to_entry(dst_ptr)));
-
- if (!src_crc) {
- bch_extent_crc_init(&_src, src.k->size,
- src.k->size, 0, 0, 0);
- src_crc = &_src;
- }
-
- p = dst_ptr != &extent_entry_last(dst)->ptr
- ? (void *) dst_crc
- : (void *) dst_ptr;
-
- crc_u64s = extent_entry_u64s(to_entry(src_crc));
- u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64);
-
- memmove_u64s_up(p + u64s, p,
- (u64 *) extent_entry_last(dst) - (u64 *) p);
- set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s);
-
- memcpy_u64s(p, src_crc, crc_u64s);
- memcpy_u64s(p + crc_u64s, src_ptr,
- sizeof(*src_ptr) / sizeof(u64));
- }
-
- /* Sort done - now drop redundant crc entries: */
- bch_extent_drop_redundant_crcs(dst);
-
- memcpy_u64s(src.v, dst.v, bkey_val_u64s(dst.k));
- set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k));
-}
-
-static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
-{
- struct cache_member_rcu *mi;
- struct bch_extent_ptr *ptr, *prev = NULL;
- union bch_extent_crc *crc;
-
- /*
- * First check if any pointers are out of order before doing the actual
- * sort:
- */
- mi = cache_member_info_get(c);
-
- extent_for_each_ptr_crc(e, ptr, crc) {
- if (prev &&
- PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) {
- __extent_sort_ptrs(mi, e);
- break;
- }
- prev = ptr;
- }
-
- cache_member_info_put();
+ nonce, csum, csum_type);
+ __extent_entry_push(e);
}
/*
@@ -2098,8 +2084,7 @@ static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
* For existing keys, only called when btree nodes are being rewritten, not when
* they're merely being compacted/resorted in memory.
*/
-static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
- bool sort)
+bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
{
struct bkey_s_extent e;
@@ -2112,7 +2097,7 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
return true;
case KEY_TYPE_DISCARD:
- return !k.k->version;
+ return bversion_zero(k.k->version);
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
@@ -2120,13 +2105,10 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
bch_extent_drop_stale(c, e);
- if (sort)
- extent_sort_ptrs(c, e);
-
if (!bkey_val_u64s(e.k)) {
if (bkey_extent_is_cached(e.k)) {
k.k->type = KEY_TYPE_DISCARD;
- if (!k.k->version)
+ if (bversion_zero(k.k->version))
return true;
} else {
k.k->type = KEY_TYPE_ERROR;
@@ -2141,9 +2123,40 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
}
}
-bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
+void bch_extent_mark_replicas_cached(struct cache_set *c,
+ struct bkey_s_extent e,
+ unsigned nr_cached)
{
- return __bch_extent_normalize(c, k, true);
+ struct bch_extent_ptr *ptr;
+ struct cache_member_rcu *mi;
+ bool have_higher_tier;
+ unsigned tier = 0;
+
+ if (!nr_cached)
+ return;
+
+ mi = cache_member_info_get(c);
+
+ do {
+ have_higher_tier = false;
+
+ extent_for_each_ptr(e, ptr) {
+ if (!ptr->cached &&
+ PTR_TIER(mi, ptr) == tier) {
+ ptr->cached = true;
+ nr_cached--;
+ if (!nr_cached)
+ goto out;
+ }
+
+ if (PTR_TIER(mi, ptr) > tier)
+ have_higher_tier = true;
+ }
+
+ tier++;
+ } while (have_higher_tier);
+out:
+ cache_member_info_put();
}
/*
@@ -2183,7 +2196,7 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
extent_for_each_online_device_crc(c, e, crc, ptr, ca)
if (!ptr_stale(ca, ptr)) {
*ret = (struct extent_pick_ptr) {
- .crc = crc_to_64(e.k, crc),
+ .crc = crc_to_128(e.k, crc),
.ptr = *ptr,
.ca = ca,
};
@@ -2227,7 +2240,7 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
if (l->k.u64s != r->k.u64s ||
l->k.type != r->k.type ||
- l->k.version != r->k.version ||
+ bversion_cmp(l->k.version, r->k.version) ||
bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
return BCH_MERGE_NOMERGE;
@@ -2235,7 +2248,6 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
case KEY_TYPE_ERROR:
- case BCH_RESERVATION:
/* These types are mergeable, and no val to check */
break;
@@ -2248,7 +2260,7 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
struct bch_extent_ptr *lp, *rp;
struct cache_member_cpu *m;
- en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data);
+ en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
if ((extent_entry_type(en_l) !=
extent_entry_type(en_r)) ||
@@ -2276,6 +2288,15 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
}
break;
+ case BCH_RESERVATION: {
+ struct bkey_i_reservation *li = bkey_i_to_reservation(l);
+ struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
+
+ if (li->v.generation != ri->v.generation ||
+ li->v.nr_replicas != ri->v.nr_replicas)
+ return BCH_MERGE_NOMERGE;
+ break;
+ }
default:
return BCH_MERGE_NOMERGE;
}
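A rough standalone model of the tier walk in the new bch_extent_mark_replicas_cached(): dirty pointers on the lowest tier are flipped to cached first, then the next tier up, until the requested count is consumed. The toy struct and tier numbering are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct toy_ptr { unsigned tier; bool cached; };

static void toy_mark_cached(struct toy_ptr *p, unsigned n, unsigned nr_cached)
{
	unsigned tier = 0;
	bool have_higher_tier;

	if (!nr_cached)
		return;

	do {
		have_higher_tier = false;

		for (unsigned i = 0; i < n; i++) {
			/* flip dirty pointers on the current tier to cached */
			if (!p[i].cached && p[i].tier == tier) {
				p[i].cached = true;
				if (!--nr_cached)
					return;
			}
			if (p[i].tier > tier)
				have_higher_tier = true;
		}
		tier++;
	} while (have_higher_tier);
}

int main(void)
{
	struct toy_ptr ptrs[] = { { 0, false }, { 1, false }, { 0, false } };

	toy_mark_cached(ptrs, 3, 2);	/* both tier-0 copies become cached */
	for (unsigned i = 0; i < 3; i++)
		printf("ptr %u: tier %u cached %d\n", i, ptrs[i].tier, ptrs[i].cached);
	return 0;
}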
diff --git a/libbcache/extents.h b/libbcache/extents.h
index e1cb47a..b0a0542 100644
--- a/libbcache/extents.h
+++ b/libbcache/extents.h
@@ -26,7 +26,7 @@ struct cache_set;
struct journal_res;
struct extent_pick_ptr {
- struct bch_extent_crc64 crc;
+ struct bch_extent_crc128 crc;
struct bch_extent_ptr ptr;
struct cache *ca;
};
@@ -53,10 +53,11 @@ bch_insert_fixup_extent(struct btree_insert *,
struct btree_insert_entry *);
bool bch_extent_normalize(struct cache_set *, struct bkey_s);
+void bch_extent_mark_replicas_cached(struct cache_set *,
+ struct bkey_s_extent, unsigned);
-unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent,
- const struct bch_extent_ptr *);
unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent);
+unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c);
static inline bool bkey_extent_is_data(const struct bkey *k)
{
@@ -117,6 +118,8 @@ static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
return sizeof(struct bch_extent_crc32);
case BCH_EXTENT_ENTRY_crc64:
return sizeof(struct bch_extent_crc64);
+ case BCH_EXTENT_ENTRY_crc128:
+ return sizeof(struct bch_extent_crc128);
case BCH_EXTENT_ENTRY_ptr:
return sizeof(struct bch_extent_ptr);
default:
@@ -143,6 +146,7 @@ union bch_extent_crc {
u8 type;
struct bch_extent_crc32 crc32;
struct bch_extent_crc64 crc64;
+ struct bch_extent_crc128 crc128;
};
/* downcast, preserves const */
@@ -185,10 +189,11 @@ enum bch_extent_crc_type {
BCH_EXTENT_CRC_NONE,
BCH_EXTENT_CRC32,
BCH_EXTENT_CRC64,
+ BCH_EXTENT_CRC128,
};
static inline enum bch_extent_crc_type
-extent_crc_type(const union bch_extent_crc *crc)
+__extent_crc_type(const union bch_extent_crc *crc)
{
if (!crc)
return BCH_EXTENT_CRC_NONE;
@@ -198,16 +203,31 @@ extent_crc_type(const union bch_extent_crc *crc)
return BCH_EXTENT_CRC32;
case BCH_EXTENT_ENTRY_crc64:
return BCH_EXTENT_CRC64;
+ case BCH_EXTENT_ENTRY_crc128:
+ return BCH_EXTENT_CRC128;
default:
BUG();
}
}
+#define extent_crc_type(_crc) \
+({ \
+ BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \
+ !type_is(_crc, struct bch_extent_crc64 *) && \
+ !type_is(_crc, struct bch_extent_crc128 *) && \
+ !type_is(_crc, union bch_extent_crc *)); \
+ \
+ type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \
+ : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \
+ : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
+ : __extent_crc_type((union bch_extent_crc *) _crc); \
+})
+
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
#define extent_entry_last(_e) \
- bkey_idx((_e).v, bkey_val_u64s((_e).k))
+ vstruct_idx((_e).v, bkey_val_u64s((_e).k))
/* Iterate over all entries: */
@@ -283,20 +303,16 @@ out: \
#define extent_ptr_next(_e, _ptr) \
extent_ptr_next_filter(_e, _ptr, true)
-#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter) \
- for ((_ptr) = (_start); \
+#define extent_for_each_ptr_filter(_e, _ptr, _filter) \
+ for ((_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \
(_ptr)++)
-#define extent_for_each_ptr_from(_e, _ptr, _start) \
- extent_for_each_ptr_from_filter(_e, _ptr, _start, true)
-
#define extent_for_each_ptr(_e, _ptr) \
- extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, true)
+ extent_for_each_ptr_filter(_e, _ptr, true)
#define extent_for_each_online_device(_c, _e, _ptr, _ca) \
- extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, \
- ((_ca) = PTR_CACHE(_c, _ptr)))
+ extent_for_each_ptr_filter(_e, _ptr, ((_ca) = PTR_CACHE(_c, _ptr)))
#define extent_ptr_prev(_e, _ptr) \
({ \
@@ -321,67 +337,114 @@ out: \
(_ptr); \
(_ptr) = extent_ptr_prev(_e, _ptr))
-void bch_extent_entry_append(struct bkey_i_extent *, union bch_extent_entry *);
void bch_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
- unsigned, u64, unsigned);
+ unsigned, unsigned, struct bch_csum, unsigned);
+
+static inline void __extent_entry_push(struct bkey_i_extent *e)
+{
+ union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
+
+ EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
+ BKEY_EXTENT_VAL_U64s_MAX);
+
+ e->k.u64s += extent_entry_u64s(entry);
+}
static inline void extent_ptr_append(struct bkey_i_extent *e,
struct bch_extent_ptr ptr)
{
ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- bch_extent_entry_append(e, to_entry(&ptr));
+ extent_entry_last(extent_i_to_s(e))->ptr = ptr;
+ __extent_entry_push(e);
}
-/* XXX: inefficient */
-static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
- struct bkey_s_c_extent e,
- const struct bch_extent_ptr *ptr)
+static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k,
+ const union bch_extent_crc *crc)
{
- if (bkey_extent_is_cached(e.k))
- return false;
-
- /* Dirty pointers come last */
- return bch_extent_nr_ptrs_from(e, ptr) <= c->opts.data_replicas;
-}
-
-extern const unsigned bch_crc_size[];
+ EBUG_ON(!k->size);
-static inline struct bch_extent_crc64 crc_to_64(const struct bkey *k,
- const union bch_extent_crc *crc)
-{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
- return (struct bch_extent_crc64) {
- .compressed_size = k->size,
- .uncompressed_size = k->size,
+ return (struct bch_extent_crc128) {
+ ._compressed_size = k->size - 1,
+ ._uncompressed_size = k->size - 1,
};
case BCH_EXTENT_CRC32:
- return (struct bch_extent_crc64) {
- .compressed_size = crc->crc32.compressed_size,
- .uncompressed_size = crc->crc32.uncompressed_size,
+ return (struct bch_extent_crc128) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc128,
+ ._compressed_size = crc->crc32._compressed_size,
+ ._uncompressed_size = crc->crc32._uncompressed_size,
.offset = crc->crc32.offset,
.csum_type = crc->crc32.csum_type,
.compression_type = crc->crc32.compression_type,
- .csum = crc->crc32.csum,
+ .csum.lo = crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
- return crc->crc64;
+ return (struct bch_extent_crc128) {
+ .type = 1 << BCH_EXTENT_ENTRY_crc128,
+ ._compressed_size = crc->crc64._compressed_size,
+ ._uncompressed_size = crc->crc64._uncompressed_size,
+ .offset = crc->crc64.offset,
+ .nonce = crc->crc64.nonce,
+ .csum_type = crc->crc64.csum_type,
+ .compression_type = crc->crc64.compression_type,
+ .csum.lo = crc->crc64.csum_lo,
+ .csum.hi = crc->crc64.csum_hi,
+ };
+ case BCH_EXTENT_CRC128:
+ return crc->crc128;
default:
BUG();
}
}
-static inline unsigned crc_compressed_size(const struct bkey *k,
- const union bch_extent_crc *crc)
-{
- return crc_to_64(k, crc).compressed_size;
-}
+#define crc_compressed_size(_k, _crc) \
+({ \
+ unsigned _size = 0; \
+ \
+ switch (extent_crc_type(_crc)) { \
+ case BCH_EXTENT_CRC_NONE: \
+ _size = ((const struct bkey *) (_k))->size; \
+ break; \
+ case BCH_EXTENT_CRC32: \
+ _size = ((struct bch_extent_crc32 *) _crc) \
+ ->_compressed_size + 1; \
+ break; \
+ case BCH_EXTENT_CRC64: \
+ _size = ((struct bch_extent_crc64 *) _crc) \
+ ->_compressed_size + 1; \
+ break; \
+ case BCH_EXTENT_CRC128: \
+ _size = ((struct bch_extent_crc128 *) _crc) \
+ ->_compressed_size + 1; \
+ break; \
+ } \
+ _size; \
+})
-static inline unsigned crc_uncompressed_size(const struct bkey *k,
- const union bch_extent_crc *crc)
-{
- return crc_to_64(k, crc).uncompressed_size;
-}
+#define crc_uncompressed_size(_k, _crc) \
+({ \
+ unsigned _size = 0; \
+ \
+ switch (extent_crc_type(_crc)) { \
+ case BCH_EXTENT_CRC_NONE: \
+ _size = ((const struct bkey *) (_k))->size; \
+ break; \
+ case BCH_EXTENT_CRC32: \
+ _size = ((struct bch_extent_crc32 *) _crc) \
+ ->_uncompressed_size + 1; \
+ break; \
+ case BCH_EXTENT_CRC64: \
+ _size = ((struct bch_extent_crc64 *) _crc) \
+ ->_uncompressed_size + 1; \
+ break; \
+ case BCH_EXTENT_CRC128: \
+ _size = ((struct bch_extent_crc128 *) _crc) \
+ ->_uncompressed_size + 1; \
+ break; \
+ } \
+ _size; \
+})
static inline unsigned crc_offset(const union bch_extent_crc *crc)
{
@@ -392,6 +455,23 @@ static inline unsigned crc_offset(const union bch_extent_crc *crc)
return crc->crc32.offset;
case BCH_EXTENT_CRC64:
return crc->crc64.offset;
+ case BCH_EXTENT_CRC128:
+ return crc->crc128.offset;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned crc_nonce(const union bch_extent_crc *crc)
+{
+ switch (extent_crc_type(crc)) {
+ case BCH_EXTENT_CRC_NONE:
+ case BCH_EXTENT_CRC32:
+ return 0;
+ case BCH_EXTENT_CRC64:
+ return crc->crc64.nonce;
+ case BCH_EXTENT_CRC128:
+ return crc->crc128.nonce;
default:
BUG();
}
@@ -406,6 +486,8 @@ static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
return crc->crc32.csum_type;
case BCH_EXTENT_CRC64:
return crc->crc64.csum_type;
+ case BCH_EXTENT_CRC128:
+ return crc->crc128.csum_type;
default:
BUG();
}
@@ -420,27 +502,33 @@ static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
return crc->crc32.compression_type;
case BCH_EXTENT_CRC64:
return crc->crc64.compression_type;
+ case BCH_EXTENT_CRC128:
+ return crc->crc128.compression_type;
default:
BUG();
}
}
-static inline u64 crc_csum(const union bch_extent_crc *crc)
+static inline struct bch_csum crc_csum(const union bch_extent_crc *crc)
{
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
- return 0;
+ return (struct bch_csum) { 0 };
case BCH_EXTENT_CRC32:
- return crc->crc32.csum;
+ return (struct bch_csum) { .lo = crc->crc32.csum };
case BCH_EXTENT_CRC64:
- return crc->crc64.csum;
+ return (struct bch_csum) {
+ .lo = crc->crc64.csum_lo,
+ .hi = crc->crc64.csum_hi,
+ };
+ case BCH_EXTENT_CRC128:
+ return crc->crc128.csum;
default:
BUG();
}
}
-static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
- struct bkey_s_c k)
+static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
const struct bch_extent_ptr *ptr;
@@ -453,7 +541,7 @@ static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
- if (bch_extent_ptr_is_dirty(c, e, ptr) &&
+ if (!ptr->cached &&
crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
crc_compressed_size(e.k, crc) < k.k->size)
ret = max_t(unsigned, ret,
@@ -463,6 +551,17 @@ static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
return ret;
}
+static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
+{
+ const union bch_extent_crc *crc;
+
+ extent_for_each_crc(e, crc)
+ if (bch_csum_type_is_encryption(crc_csum_type(crc)))
+ return crc_offset(crc) + crc_nonce(crc);
+
+ return 0;
+}
+
void bch_extent_narrow_crcs(struct bkey_s_extent);
void bch_extent_drop_redundant_crcs(struct bkey_s_extent);
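The crc32/crc64/crc128 variants above store their sizes biased by one (note the "- 1" in crc_to_128() and the "+ 1" in the new accessor macros), so an n-bit field covers 1..2^n sectors rather than wasting a code point on zero. A minimal standalone sketch of that bias, using an arbitrary 7-bit field width and hypothetical helper names (pack_size/unpack_size) not taken from the tree:

#include <assert.h>

/* Hypothetical 7-bit size field, mirroring the "store size - 1" bias used
 * by the extent crc entries above: 0 encodes 1 sector, 127 encodes 128. */
static unsigned pack_size(unsigned sectors)
{
        assert(sectors >= 1 && sectors <= 128);
        return sectors - 1;
}

static unsigned unpack_size(unsigned field)
{
        return field + 1;       /* matches the "+ 1" in crc_compressed_size() */
}

int main(void)
{
        assert(unpack_size(pack_size(128)) == 128);     /* max value still fits */
        return 0;
}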
diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c
index 1dec230..a758e89 100644
--- a/libbcache/fs-gc.c
+++ b/libbcache/fs-gc.c
@@ -17,7 +17,7 @@ static int remove_dirent(struct cache_set *c, struct btree_iter *iter,
struct bkey_s_c_dirent dirent)
{
struct qstr name;
- struct bkey_i_inode dir_inode;
+ struct bch_inode_unpacked dir_inode;
struct bch_hash_info dir_hash_info;
u64 dir_inum = dirent.k->p.inode;
int ret;
@@ -39,7 +39,7 @@ static int remove_dirent(struct cache_set *c, struct btree_iter *iter,
if (ret)
goto err;
- dir_hash_info = bch_hash_info_init(&dir_inode.v);
+ dir_hash_info = bch_hash_info_init(&dir_inode);
ret = bch_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
err:
@@ -48,11 +48,12 @@ err:
}
static int reattach_inode(struct cache_set *c,
- struct bkey_i_inode *lostfound_inode,
+ struct bch_inode_unpacked *lostfound_inode,
u64 inum)
{
struct bch_hash_info lostfound_hash_info =
- bch_hash_info_init(&lostfound_inode->v);
+ bch_hash_info_init(lostfound_inode);
+ struct bkey_inode_buf packed;
char name_buf[20];
struct qstr name;
int ret;
@@ -60,14 +61,16 @@ static int reattach_inode(struct cache_set *c,
snprintf(name_buf, sizeof(name_buf), "%llu", inum);
name = (struct qstr) QSTR(name_buf);
- le32_add_cpu(&lostfound_inode->v.i_nlink, 1);
+ lostfound_inode->i_nlink++;
- ret = bch_btree_insert(c, BTREE_ID_INODES, &lostfound_inode->k_i,
+ bch_inode_pack(&packed, lostfound_inode);
+
+ ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
NULL, NULL, NULL, 0);
if (ret)
return ret;
- return bch_dirent_create(c, lostfound_inode->k.p.inode,
+ return bch_dirent_create(c, lostfound_inode->inum,
&lostfound_hash_info,
DT_DIR, &name, inum, NULL, 0);
}
@@ -75,10 +78,8 @@ static int reattach_inode(struct cache_set *c,
struct inode_walker {
bool first_this_inode;
bool have_inode;
- u16 i_mode;
- u64 i_size;
u64 cur_inum;
- struct bkey_i_inode inode;
+ struct bch_inode_unpacked inode;
};
static struct inode_walker inode_walker_init(void)
@@ -101,11 +102,6 @@ static int walk_inode(struct cache_set *c, struct inode_walker *w, u64 inum)
return ret;
w->have_inode = !ret;
-
- if (w->have_inode) {
- w->i_mode = le16_to_cpu(w->inode.v.i_mode);
- w->i_size = le64_to_cpu(w->inode.v.i_size);
- }
}
return 0;
@@ -138,20 +134,20 @@ static int check_extents(struct cache_set *c)
k.k->type, k.k->p.inode);
unfixable_fsck_err_on(w.first_this_inode && w.have_inode &&
- le64_to_cpu(w.inode.v.i_sectors) !=
+ w.inode.i_sectors !=
(i_sectors = bch_count_inode_sectors(c, w.cur_inum)),
c, "i_sectors wrong: got %llu, should be %llu",
- le64_to_cpu(w.inode.v.i_sectors), i_sectors);
+ w.inode.i_sectors, i_sectors);
unfixable_fsck_err_on(w.have_inode &&
- !S_ISREG(w.i_mode) && !S_ISLNK(w.i_mode), c,
+ !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c,
"extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.i_mode);
+ k.k->type, k.k->p.inode, w.inode.i_mode);
unfixable_fsck_err_on(k.k->type != BCH_RESERVATION &&
- k.k->p.offset > round_up(w.i_size, PAGE_SIZE) >> 9, c,
+ k.k->p.offset > round_up(w.inode.i_size, PAGE_SIZE) >> 9, c,
"extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.i_size);
+ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size);
}
fsck_err:
return bch_btree_iter_unlock(&iter) ?: ret;
@@ -172,7 +168,7 @@ static int check_dirents(struct cache_set *c)
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
POS(BCACHE_ROOT_INO, 0), k) {
struct bkey_s_c_dirent d;
- struct bkey_i_inode target;
+ struct bch_inode_unpacked target;
bool have_target;
u64 d_inum;
@@ -184,9 +180,9 @@ static int check_dirents(struct cache_set *c)
"dirent in nonexisting directory %llu",
k.k->p.inode);
- unfixable_fsck_err_on(!S_ISDIR(w.i_mode), c,
+ unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
"dirent in non directory inode %llu, type %u",
- k.k->p.inode, mode_to_type(w.i_mode));
+ k.k->p.inode, mode_to_type(w.inode.i_mode));
if (k.k->type != BCH_DIRENT)
continue;
@@ -220,10 +216,10 @@ static int check_dirents(struct cache_set *c)
if (fsck_err_on(have_target &&
d.v->d_type !=
- mode_to_type(le16_to_cpu(target.v.i_mode)), c,
+ mode_to_type(le16_to_cpu(target.i_mode)), c,
"incorrect d_type: got %u should be %u, filename %s",
d.v->d_type,
- mode_to_type(le16_to_cpu(target.v.i_mode)),
+ mode_to_type(le16_to_cpu(target.i_mode)),
d.v->d_name)) {
struct bkey_i_dirent *n;
@@ -234,7 +230,7 @@ static int check_dirents(struct cache_set *c)
}
bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(le16_to_cpu(target.v.i_mode));
+ n->v.d_type = mode_to_type(le16_to_cpu(target.i_mode));
ret = bch_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
@@ -276,8 +272,9 @@ fsck_err:
}
/* Get root directory, create if it doesn't exist: */
-static int check_root(struct cache_set *c, struct bkey_i_inode *root_inode)
+static int check_root(struct cache_set *c, struct bch_inode_unpacked *root_inode)
{
+ struct bkey_inode_buf packed;
int ret;
ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode);
@@ -287,7 +284,7 @@ static int check_root(struct cache_set *c, struct bkey_i_inode *root_inode)
if (fsck_err_on(ret, c, "root directory missing"))
goto create_root;
- if (fsck_err_on(!S_ISDIR(le16_to_cpu(root_inode->v.i_mode)), c,
+ if (fsck_err_on(!S_ISDIR(root_inode->i_mode), c,
"root inode not a directory"))
goto create_root;
@@ -296,19 +293,23 @@ fsck_err:
return ret;
create_root:
bch_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
- root_inode->k.p.inode = BCACHE_ROOT_INO;
+ root_inode->inum = BCACHE_ROOT_INO;
+
+ bch_inode_pack(&packed, root_inode);
- return bch_btree_insert(c, BTREE_ID_INODES, &root_inode->k_i,
+ return bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
NULL, NULL, NULL, 0);
}
/* Get lost+found, create if it doesn't exist: */
static int check_lostfound(struct cache_set *c,
- struct bkey_i_inode *root_inode,
- struct bkey_i_inode *lostfound_inode)
+ struct bch_inode_unpacked *root_inode,
+ struct bch_inode_unpacked *lostfound_inode)
{
struct qstr lostfound = QSTR("lost+found");
- struct bch_hash_info root_hash_info = bch_hash_info_init(&root_inode->v);
+ struct bch_hash_info root_hash_info =
+ bch_hash_info_init(root_inode);
+ struct bkey_inode_buf packed;
u64 inum;
int ret;
@@ -326,7 +327,7 @@ static int check_lostfound(struct cache_set *c,
if (fsck_err_on(ret, c, "lost+found missing"))
goto create_lostfound;
- if (fsck_err_on(!S_ISDIR(le16_to_cpu(lostfound_inode->v.i_mode)), c,
+ if (fsck_err_on(!S_ISDIR(lostfound_inode->i_mode), c,
"lost+found inode not a directory"))
goto create_lostfound;
@@ -334,22 +335,27 @@ static int check_lostfound(struct cache_set *c,
fsck_err:
return ret;
create_lostfound:
- le32_add_cpu(&root_inode->v.i_nlink, 1);
+ root_inode->i_nlink++;
- ret = bch_btree_insert(c, BTREE_ID_INODES, &root_inode->k_i,
+ bch_inode_pack(&packed, root_inode);
+
+ ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
NULL, NULL, NULL, 0);
if (ret)
return ret;
bch_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ bch_inode_pack(&packed, lostfound_inode);
- ret = bch_inode_create(c, &lostfound_inode->k_i, BLOCKDEV_INODE_MAX, 0,
+ ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (ret)
return ret;
+ lostfound_inode->inum = packed.inode.k.p.inode;
+
ret = bch_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR,
- &lostfound, lostfound_inode->k.p.inode, NULL, 0);
+ &lostfound, lostfound_inode->inum, NULL, 0);
if (ret)
return ret;
@@ -420,7 +426,7 @@ static int path_down(struct pathbuf *p, u64 inum)
noinline_for_stack
static int check_directory_structure(struct cache_set *c,
- struct bkey_i_inode *lostfound_inode)
+ struct bch_inode_unpacked *lostfound_inode)
{
struct inode_bitmap dirs_done = { NULL, 0 };
struct pathbuf path = { 0, 0, NULL };
@@ -618,25 +624,30 @@ s64 bch_count_inode_sectors(struct cache_set *c, u64 inum)
}
static int bch_gc_do_inode(struct cache_set *c,
- struct bkey_i_inode *lostfound_inode,
+ struct bch_inode_unpacked *lostfound_inode,
struct btree_iter *iter,
struct bkey_s_c_inode inode, struct nlink link)
{
- u16 i_mode = le16_to_cpu(inode.v->i_mode);
- u32 i_flags = le32_to_cpu(inode.v->i_flags);
- u32 i_nlink = le32_to_cpu(inode.v->i_nlink);
- u64 i_size = le64_to_cpu(inode.v->i_size);
- s64 i_sectors = 0;
+ struct bch_inode_unpacked u;
int ret = 0;
- u32 real_i_nlink;
+ u32 i_nlink, real_i_nlink;
+ bool do_update = false;
+
+ ret = bch_inode_unpack(inode, &u);
+ if (cache_set_inconsistent_on(ret, c,
+ "error unpacking inode %llu in fs-gc",
+ inode.k->p.inode))
+ return ret;
+
+ i_nlink = u.i_nlink + nlink_bias(u.i_mode);
fsck_err_on(i_nlink < link.count, c,
"inode %llu i_link too small (%u < %u, type %i)",
inode.k->p.inode, i_nlink,
- link.count, mode_to_type(i_mode));
+ link.count, mode_to_type(u.i_mode));
/* These should have been caught/fixed by earlier passes: */
- if (S_ISDIR(i_mode)) {
+ if (S_ISDIR(u.i_mode)) {
need_fsck_err_on(link.count > 1, c,
"directory %llu with multiple hardlinks: %u",
inode.k->p.inode, link.count);
@@ -656,7 +667,7 @@ static int bch_gc_do_inode(struct cache_set *c,
"but found orphaned inode %llu",
inode.k->p.inode);
- if (fsck_err_on(S_ISDIR(i_mode) &&
+ if (fsck_err_on(S_ISDIR(u.i_mode) &&
bch_empty_dir(c, inode.k->p.inode), c,
"non empty directory with link count 0, "
"inode nlink %u, dir links found %u",
@@ -676,7 +687,7 @@ static int bch_gc_do_inode(struct cache_set *c,
return ret;
}
- if (i_flags & BCH_INODE_I_SIZE_DIRTY) {
+ if (u.i_flags & BCH_INODE_I_SIZE_DIRTY) {
fsck_err_on(c->sb.clean, c,
"filesystem marked clean, "
"but inode %llu has i_size dirty",
@@ -690,7 +701,7 @@ static int bch_gc_do_inode(struct cache_set *c,
*/
ret = bch_inode_truncate(c, inode.k->p.inode,
- round_up(i_size, PAGE_SIZE) >> 9,
+ round_up(u.i_size, PAGE_SIZE) >> 9,
NULL, NULL);
if (ret) {
bch_err(c, "error in fs gc: error %i "
@@ -702,10 +713,15 @@ static int bch_gc_do_inode(struct cache_set *c,
* We truncated without our normal sector accounting hook, just
* make sure we recalculate it:
*/
- i_flags |= BCH_INODE_I_SECTORS_DIRTY;
+ u.i_flags |= BCH_INODE_I_SECTORS_DIRTY;
+
+ u.i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ do_update = true;
}
- if (i_flags & BCH_INODE_I_SECTORS_DIRTY) {
+ if (u.i_flags & BCH_INODE_I_SECTORS_DIRTY) {
+ s64 sectors;
+
fsck_err_on(c->sb.clean, c,
"filesystem marked clean, "
"but inode %llu has i_sectors dirty",
@@ -714,13 +730,17 @@ static int bch_gc_do_inode(struct cache_set *c,
bch_verbose(c, "recounting sectors for inode %llu",
inode.k->p.inode);
- i_sectors = bch_count_inode_sectors(c, inode.k->p.inode);
- if (i_sectors < 0) {
+ sectors = bch_count_inode_sectors(c, inode.k->p.inode);
+ if (sectors < 0) {
bch_err(c, "error in fs gc: error %i "
"recounting inode sectors",
- (int) i_sectors);
- return i_sectors;
+ (int) sectors);
+ return sectors;
}
+
+ u.i_sectors = sectors;
+ u.i_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+ do_update = true;
}
if (i_nlink != real_i_nlink) {
@@ -728,30 +748,23 @@ static int bch_gc_do_inode(struct cache_set *c,
"filesystem marked clean, "
"but inode %llu has wrong i_nlink "
"(type %u i_nlink %u, should be %u)",
- inode.k->p.inode, mode_to_type(i_mode),
+ inode.k->p.inode, mode_to_type(u.i_mode),
i_nlink, real_i_nlink);
bch_verbose(c, "setting inode %llu nlinks from %u to %u",
inode.k->p.inode, i_nlink, real_i_nlink);
+ u.i_nlink = real_i_nlink - nlink_bias(u.i_mode);

+ do_update = true;
}
- if (i_nlink != real_i_nlink||
- i_flags & BCH_INODE_I_SECTORS_DIRTY ||
- i_flags & BCH_INODE_I_SIZE_DIRTY) {
- struct bkey_i_inode update;
-
- bkey_reassemble(&update.k_i, inode.s_c);
- update.v.i_nlink = cpu_to_le32(real_i_nlink);
- update.v.i_flags = cpu_to_le32(i_flags &
- ~(BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY));
+ if (do_update) {
+ struct bkey_inode_buf p;
- if (i_flags & BCH_INODE_I_SECTORS_DIRTY)
- update.v.i_sectors = cpu_to_le64(i_sectors);
+ bch_inode_pack(&p, &u);
ret = bch_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(iter, &update.k_i));
+ BTREE_INSERT_ENTRY(iter, &p.inode.k_i));
if (ret && ret != -EINTR)
bch_err(c, "error in fs gc: error %i "
"updating inode", ret);
@@ -762,7 +775,7 @@ fsck_err:
noinline_for_stack
static int bch_gc_walk_inodes(struct cache_set *c,
- struct bkey_i_inode *lostfound_inode,
+ struct bch_inode_unpacked *lostfound_inode,
struct nlinks *links,
u64 range_start, u64 range_end)
{
@@ -835,7 +848,7 @@ fsck_err:
noinline_for_stack
static int check_inode_nlinks(struct cache_set *c,
- struct bkey_i_inode *lostfound_inode)
+ struct bch_inode_unpacked *lostfound_inode)
{
struct nlinks links;
u64 this_iter_range_start, next_iter_range_start = 0;
@@ -873,7 +886,7 @@ static int check_inode_nlinks(struct cache_set *c,
*/
int bch_fsck(struct cache_set *c, bool full_fsck)
{
- struct bkey_i_inode root_inode, lostfound_inode;
+ struct bch_inode_unpacked root_inode, lostfound_inode;
int ret;
ret = check_root(c, &root_inode);
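The fs-gc changes above switch to bch_inode_unpacked, where i_nlink is stored biased by nlink_bias(mode): bch_gc_do_inode() reconstructs the VFS-style count as u.i_nlink + nlink_bias(u.i_mode) and subtracts the bias again before writing. A minimal sketch of that round trip, with illustrative function names (disk_to_vfs_nlink/vfs_to_disk_nlink) that are not from the tree:

#include <assert.h>
#include <sys/stat.h>

/* Mirrors nlink_bias() from fs.h below: directories start at 2 links
 * ("." plus the parent's entry), everything else at 1. */
static unsigned nlink_bias(mode_t mode)
{
        return S_ISDIR(mode) ? 2 : 1;
}

static unsigned disk_to_vfs_nlink(unsigned disk_nlink, mode_t mode)
{
        return disk_nlink + nlink_bias(mode);
}

static unsigned vfs_to_disk_nlink(unsigned vfs_nlink, mode_t mode)
{
        /* caller must ensure vfs_nlink >= bias, as __bch_write_inode() does */
        return vfs_nlink - nlink_bias(mode);
}

int main(void)
{
        assert(disk_to_vfs_nlink(0, S_IFDIR) == 2);     /* empty directory */
        assert(vfs_to_disk_nlink(1, S_IFREG) == 0);     /* file with one link */
        return 0;
}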
diff --git a/libbcache/fs-io.c b/libbcache/fs-io.c
index 942baeb..ecf249c 100644
--- a/libbcache/fs-io.c
+++ b/libbcache/fs-io.c
@@ -59,22 +59,20 @@ static int write_invalidate_inode_pages_range(struct address_space *mapping,
/* i_size updates: */
-static int inode_set_size(struct bch_inode_info *ei, struct bch_inode *bi,
+static int inode_set_size(struct bch_inode_info *ei,
+ struct bch_inode_unpacked *bi,
void *p)
{
loff_t *new_i_size = p;
- unsigned i_flags = le32_to_cpu(bi->i_flags);
lockdep_assert_held(&ei->update_lock);
- bi->i_size = cpu_to_le64(*new_i_size);
+ bi->i_size = *new_i_size;
if (atomic_long_read(&ei->i_size_dirty_count))
- i_flags |= BCH_INODE_I_SIZE_DIRTY;
+ bi->i_flags |= BCH_INODE_I_SIZE_DIRTY;
else
- i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
- bi->i_flags = cpu_to_le32(i_flags);
+ bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
return 0;
}
@@ -122,23 +120,22 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
}
static int inode_set_i_sectors_dirty(struct bch_inode_info *ei,
- struct bch_inode *bi, void *p)
+ struct bch_inode_unpacked *bi, void *p)
{
- BUG_ON(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY);
+ BUG_ON(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY);
- bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags)|
- BCH_INODE_I_SECTORS_DIRTY);
+ bi->i_flags |= BCH_INODE_I_SECTORS_DIRTY;
return 0;
}
static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei,
- struct bch_inode *bi, void *p)
+ struct bch_inode_unpacked *bi,
+ void *p)
{
- BUG_ON(!(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY));
+ BUG_ON(!(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY));
- bi->i_sectors = cpu_to_le64(atomic64_read(&ei->i_sectors));
- bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags) &
- ~BCH_INODE_I_SECTORS_DIRTY);
+ bi->i_sectors = atomic64_read(&ei->i_sectors);
+ bi->i_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
return 0;
}
@@ -203,7 +200,10 @@ static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei,
struct bchfs_extent_trans_hook {
struct bchfs_write_op *op;
struct extent_insert_hook hook;
- struct bkey_i_inode new_inode;
+
+ struct bch_inode_unpacked inode_u;
+ struct bkey_inode_buf inode_p;
+
bool need_inode_update;
};
@@ -222,6 +222,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
(k.k && bkey_extent_is_allocation(k.k));
s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
+ bool do_pack = false;
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
@@ -234,7 +235,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
return BTREE_HOOK_RESTART_TRANS;
}
- h->new_inode.v.i_size = cpu_to_le64(offset);
+ h->inode_u.i_size = offset;
+ do_pack = true;
+
ei->i_size = offset;
if (h->op->is_dio)
@@ -247,7 +250,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
return BTREE_HOOK_RESTART_TRANS;
}
- le64_add_cpu(&h->new_inode.v.i_sectors, sectors);
+ h->inode_u.i_sectors += sectors;
+ do_pack = true;
+
atomic64_add(sectors, &ei->i_sectors);
h->op->sectors_added += sectors;
@@ -259,6 +264,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
}
}
+ if (do_pack)
+ bch_inode_pack(&h->inode_p, &h->inode_u);
+
return BTREE_HOOK_DO_INSERT;
}
@@ -310,13 +318,32 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
break;
}
- bkey_reassemble(&hook.new_inode.k_i, inode);
+ if (WARN_ONCE(bkey_bytes(inode.k) >
+ sizeof(hook.inode_p),
+ "inode %llu too big (%zu bytes, buf %zu)",
+ extent_iter.pos.inode,
+ bkey_bytes(inode.k),
+ sizeof(hook.inode_p))) {
+ ret = -ENOENT;
+ break;
+ }
+
+ bkey_reassemble(&hook.inode_p.inode.k_i, inode);
+ ret = bch_inode_unpack(bkey_s_c_to_inode(inode),
+ &hook.inode_u);
+ if (WARN_ONCE(ret,
+ "error %i unpacking inode %llu",
+ ret, extent_iter.pos.inode)) {
+ ret = -ENOENT;
+ break;
+ }
ret = bch_btree_insert_at(wop->c, &wop->res,
&hook.hook, op_journal_seq(wop),
BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
BTREE_INSERT_ENTRY(&extent_iter, k),
- BTREE_INSERT_ENTRY(&inode_iter, &hook.new_inode.k_i));
+ BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
+ &hook.inode_p.inode.k_i, 2));
} else {
ret = bch_btree_insert_at(wop->c, &wop->res,
&hook.hook, op_journal_seq(wop),
@@ -350,25 +377,15 @@ err:
struct bch_page_state {
union { struct {
/*
- * BCH_PAGE_ALLOCATED: page is _fully_ written on disk, and not
- * compressed - which means to write this page we don't have to reserve
- * space (the new write will never take up more space on disk than what
- * it's overwriting)
- *
- * BCH_PAGE_UNALLOCATED: page is not fully written on disk, or is
- * compressed - before writing we have to reserve space with
- * bch_reserve_sectors()
- *
- * BCH_PAGE_RESERVED: page has space reserved on disk (reservation will
- * be consumed when the page is written).
+ * page is _fully_ written on disk, and not compressed - which means to
+ * write this page we don't have to reserve space (the new write will
+ * never take up more space on disk than what it's overwriting)
*/
- enum {
- BCH_PAGE_UNALLOCATED = 0,
- BCH_PAGE_ALLOCATED,
- } alloc_state:2;
+ unsigned allocated:1;
/* Owns PAGE_SECTORS sized reservation: */
unsigned reserved:1;
+ unsigned nr_replicas:4;
/*
* Number of sectors on disk - for i_blocks
@@ -431,11 +448,9 @@ static int bch_get_page_reservation(struct cache_set *c, struct page *page,
struct disk_reservation res;
int ret = 0;
- BUG_ON(s->alloc_state == BCH_PAGE_ALLOCATED &&
- s->sectors != PAGE_SECTORS);
+ BUG_ON(s->allocated && s->sectors != PAGE_SECTORS);
- if (s->reserved ||
- s->alloc_state == BCH_PAGE_ALLOCATED)
+ if (s->allocated || s->reserved)
return 0;
ret = bch_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc
@@ -448,7 +463,8 @@ static int bch_get_page_reservation(struct cache_set *c, struct page *page,
bch_disk_reservation_put(c, &res);
return 0;
}
- new.reserved = 1;
+ new.reserved = 1;
+ new.nr_replicas = res.nr_replicas;
});
return 0;
@@ -585,10 +601,10 @@ static void bch_mark_pages_unalloc(struct bio *bio)
struct bio_vec bv;
bio_for_each_segment(bv, bio, iter)
- page_state(bv.bv_page)->alloc_state = BCH_PAGE_UNALLOCATED;
+ page_state(bv.bv_page)->allocated = 0;
}
-static void bch_add_page_sectors(struct bio *bio, const struct bkey *k)
+static void bch_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
struct bvec_iter iter;
struct bio_vec bv;
@@ -597,12 +613,17 @@ static void bch_add_page_sectors(struct bio *bio, const struct bkey *k)
struct bch_page_state *s = page_state(bv.bv_page);
/* sectors in @k from the start of this page: */
- unsigned k_sectors = k->size - (iter.bi_sector - k->p.offset);
+ unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset);
unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
- BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
+ if (!s->sectors)
+ s->nr_replicas = bch_extent_nr_dirty_ptrs(k);
+ else
+ s->nr_replicas = min_t(unsigned, s->nr_replicas,
+ bch_extent_nr_dirty_ptrs(k));
+ BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
s->sectors += page_sectors;
}
}
@@ -634,7 +655,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode
EBUG_ON(s->reserved);
- s->alloc_state = BCH_PAGE_ALLOCATED;
+ s->allocated = 1;
s->sectors = 0;
}
@@ -650,7 +671,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode
k = bkey_i_to_s_c(&tmp.k);
if (!bkey_extent_is_allocation(k.k) ||
- bkey_extent_is_compressed(c, k))
+ bkey_extent_is_compressed(k))
bch_mark_pages_unalloc(bio);
bch_extent_pick_ptr(c, k, &pick);
@@ -667,7 +688,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode
swap(bio->bi_iter.bi_size, bytes);
if (bkey_extent_is_allocation(k.k))
- bch_add_page_sectors(bio, k.k);
+ bch_add_page_sectors(bio, k);
if (pick.ca) {
PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
@@ -859,6 +880,10 @@ static void bch_writepage_io_alloc(struct cache_set *c,
struct page *page)
{
u64 inum = ei->vfs_inode.i_ino;
+ unsigned nr_replicas = page_state(page)->nr_replicas;
+
+ EBUG_ON(!nr_replicas);
+ /* XXX: disk_reservation->gen isn't plumbed through */
if (!w->io) {
alloc_io:
@@ -881,7 +906,8 @@ alloc_io:
w->io->op.op.index_update_fn = bchfs_write_index_update;
}
- if (bio_add_page_contig(&w->io->bio.bio, page)) {
+ if (w->io->op.op.res.nr_replicas != nr_replicas ||
+ bio_add_page_contig(&w->io->bio.bio, page)) {
bch_writepage_do_io(w);
goto alloc_io;
}
@@ -936,13 +962,13 @@ do_io:
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
- BUG_ON(!new.reserved &&
- (new.sectors != PAGE_SECTORS ||
- new.alloc_state != BCH_PAGE_ALLOCATED));
+ EBUG_ON(!new.reserved &&
+ (new.sectors != PAGE_SECTORS ||
+ !new.allocated));
- if (new.alloc_state == BCH_PAGE_ALLOCATED &&
+ if (new.allocated &&
w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
- new.alloc_state = BCH_PAGE_UNALLOCATED;
+ new.allocated = 0;
else if (!new.reserved)
goto out;
new.reserved = 0;
@@ -1919,7 +1945,7 @@ int bch_truncate(struct inode *inode, struct iattr *iattr)
mutex_lock(&ei->update_lock);
setattr_copy(inode, iattr);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
/* clear I_SIZE_DIRTY: */
i_size_dirty_put(ei);
@@ -1981,7 +2007,7 @@ static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len)
ret = bch_discard(c,
POS(ino, discard_start),
POS(ino, discard_end),
- 0,
+ ZERO_VERSION,
&disk_res,
&i_sectors_hook.hook,
&ei->journal_seq);
@@ -2132,12 +2158,11 @@ static long bch_fallocate(struct inode *inode, int mode,
struct cache_set *c = inode->i_sb->s_fs_info;
struct i_sectors_hook i_sectors_hook;
struct btree_iter iter;
- struct bkey_i reservation;
- struct bkey_s_c k;
struct bpos end;
loff_t block_start, block_end;
loff_t new_size = offset + len;
unsigned sectors;
+ unsigned replicas = READ_ONCE(c->opts.data_replicas);
int ret;
bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
@@ -2186,13 +2211,16 @@ static long bch_fallocate(struct inode *inode, int mode,
while (bkey_cmp(iter.pos, end) < 0) {
struct disk_reservation disk_res = { 0 };
+ struct bkey_i_reservation reservation;
+ struct bkey_s_c k;
k = bch_btree_iter_peek_with_holes(&iter);
if ((ret = btree_iter_err(k)))
goto btree_iter_err;
/* already reserved */
- if (k.k->type == BCH_RESERVATION) {
+ if (k.k->type == BCH_RESERVATION &&
+ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
bch_btree_iter_advance_pos(&iter);
continue;
}
@@ -2204,29 +2232,32 @@ static long bch_fallocate(struct inode *inode, int mode,
}
}
- bkey_init(&reservation.k);
+ bkey_reservation_init(&reservation.k_i);
reservation.k.type = BCH_RESERVATION;
reservation.k.p = k.k->p;
reservation.k.size = k.k->size;
- bch_cut_front(iter.pos, &reservation);
+ bch_cut_front(iter.pos, &reservation.k_i);
bch_cut_back(end, &reservation.k);
sectors = reservation.k.size;
+ reservation.v.nr_replicas = bch_extent_nr_dirty_ptrs(k);
- if (!bkey_extent_is_allocation(k.k) ||
- bkey_extent_is_compressed(c, k)) {
+ if (reservation.v.nr_replicas < replicas ||
+ bkey_extent_is_compressed(k)) {
ret = bch_disk_reservation_get(c, &disk_res,
sectors, 0);
if (ret)
goto err_put_sectors_dirty;
+
+ reservation.v.nr_replicas = disk_res.nr_replicas;
}
ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
&ei->journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &reservation));
+ BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
bch_disk_reservation_put(c, &disk_res);
btree_iter_err:
if (ret < 0 && ret != -EINTR)
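The page-state changes above add a per-page nr_replicas: bch_add_page_sectors() keeps the minimum replica count across all extents backing the page, and bch_writepage_io_alloc() refuses to reuse an open write op whose reservation has a different replica count. A reduced sketch of that accounting, under invented names (page_repl/account_extent):

#include <assert.h>

/* The page remembers the smallest replica count of any extent backing it,
 * so a later write knows how large a disk reservation it needs. */
struct page_repl {
        unsigned sectors;
        unsigned nr_replicas;
};

static void account_extent(struct page_repl *s,
                           unsigned extent_sectors,
                           unsigned extent_replicas)
{
        if (!s->sectors)
                s->nr_replicas = extent_replicas;
        else if (extent_replicas < s->nr_replicas)
                s->nr_replicas = extent_replicas;

        s->sectors += extent_sectors;
}

int main(void)
{
        struct page_repl s = { 0, 0 };

        account_extent(&s, 4, 3);       /* first extent: 3 dirty pointers */
        account_extent(&s, 4, 1);       /* second extent: only 1 */
        assert(s.nr_replicas == 1 && s.sectors == 8);
        return 0;
}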
diff --git a/libbcache/fs.c b/libbcache/fs.c
index 884a950..76948e7 100644
--- a/libbcache/fs.c
+++ b/libbcache/fs.c
@@ -26,7 +26,9 @@
static struct kmem_cache *bch_inode_cache;
-static void bch_vfs_inode_init(struct bch_inode_info *, struct bkey_s_c_inode);
+static void bch_vfs_inode_init(struct cache_set *,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *);
/*
* I_SIZE_DIRTY requires special handling:
@@ -63,11 +65,20 @@ int __must_check __bch_write_inode(struct cache_set *c,
{
struct btree_iter iter;
struct inode *inode = &ei->vfs_inode;
- struct bkey_i_inode new_inode;
- struct bch_inode *bi;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_inode_buf inode_p;
u64 inum = inode->i_ino;
+ unsigned i_nlink = READ_ONCE(inode->i_nlink);
int ret;
+ /*
+ * We can't write an inode with i_nlink == 0 because it's stored biased;
+ * however, we don't need to because if i_nlink is 0 the inode is
+ * getting deleted when it's evicted.
+ */
+ if (!i_nlink)
+ return 0;
+
lockdep_assert_held(&ei->update_lock);
bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0));
@@ -84,33 +95,41 @@ int __must_check __bch_write_inode(struct cache_set *c,
return -ENOENT;
}
- bkey_reassemble(&new_inode.k_i, k);
- bi = &new_inode.v;
+ ret = bch_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+ if (WARN_ONCE(ret,
+ "error %i unpacking inode %llu", ret, inum)) {
+ ret = -ENOENT;
+ break;
+ }
if (set) {
- ret = set(ei, bi, p);
+ ret = set(ei, &inode_u, p);
if (ret)
goto out;
}
- bi->i_mode = cpu_to_le16(inode->i_mode);
- bi->i_uid = cpu_to_le32(i_uid_read(inode));
- bi->i_gid = cpu_to_le32(i_gid_read(inode));
- bi->i_nlink = cpu_to_le32(inode->i_nlink);
- bi->i_dev = cpu_to_le32(inode->i_rdev);
- bi->i_atime = cpu_to_le64(timespec_to_ns(&inode->i_atime));
- bi->i_mtime = cpu_to_le64(timespec_to_ns(&inode->i_mtime));
- bi->i_ctime = cpu_to_le64(timespec_to_ns(&inode->i_ctime));
+ BUG_ON(i_nlink < nlink_bias(inode->i_mode));
+
+ inode_u.i_mode = inode->i_mode;
+ inode_u.i_uid = i_uid_read(inode);
+ inode_u.i_gid = i_gid_read(inode);
+ inode_u.i_nlink = i_nlink - nlink_bias(inode->i_mode);
+ inode_u.i_dev = inode->i_rdev;
+ inode_u.i_atime = timespec_to_bch_time(c, inode->i_atime);
+ inode_u.i_mtime = timespec_to_bch_time(c, inode->i_mtime);
+ inode_u.i_ctime = timespec_to_bch_time(c, inode->i_ctime);
+
+ bch_inode_pack(&inode_p, &inode_u);
ret = bch_btree_insert_at(c, NULL, NULL, &ei->journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
- BTREE_INSERT_ENTRY(&iter, &new_inode.k_i));
+ BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
if (!ret) {
- ei->i_size = le64_to_cpu(bi->i_size);
- ei->i_flags = le32_to_cpu(bi->i_flags);
+ ei->i_size = inode_u.i_size;
+ ei->i_flags = inode_u.i_flags;
}
out:
bch_btree_iter_unlock(&iter);
@@ -138,7 +157,7 @@ int bch_inc_nlink(struct cache_set *c, struct bch_inode_info *ei)
int bch_dec_nlink(struct cache_set *c, struct bch_inode_info *ei)
{
- int ret;
+ int ret = 0;
mutex_lock(&ei->update_lock);
drop_nlink(&ei->vfs_inode);
@@ -152,9 +171,8 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
{
struct cache_set *c = sb->s_fs_info;
struct inode *inode;
+ struct bch_inode_unpacked inode_u;
struct bch_inode_info *ei;
- struct btree_iter iter;
- struct bkey_s_c k;
int ret;
pr_debug("inum %llu", inum);
@@ -165,24 +183,19 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
if (!(inode->i_state & I_NEW))
return inode;
- bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0));
- k = bch_btree_iter_peek_with_holes(&iter);
-
- if ((ret = btree_iter_err(k)) || k.k->type != BCH_INODE_FS) {
- ret = bch_btree_iter_unlock(&iter);
+ ret = bch_inode_find_by_inum(c, inum, &inode_u);
+ if (ret) {
iget_failed(inode);
- return ERR_PTR(ret ?: -ENOENT);
+ return ERR_PTR(ret);
}
ei = to_bch_ei(inode);
- bch_vfs_inode_init(ei, bkey_s_c_to_inode(k));
+ bch_vfs_inode_init(c, ei, &inode_u);
ei->journal_seq = bch_inode_journal_seq(&c->journal, inum);
unlock_new_inode(inode);
- bch_btree_iter_unlock(&iter);
-
return inode;
}
@@ -193,7 +206,8 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c,
struct inode *inode;
struct posix_acl *default_acl = NULL, *acl = NULL;
struct bch_inode_info *ei;
- struct bkey_i_inode bkey_inode;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_inode_buf inode_p;
int ret;
inode = new_inode(parent->i_sb);
@@ -210,10 +224,11 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c,
ei = to_bch_ei(inode);
- bch_inode_init(c, &bkey_inode, i_uid_read(inode),
+ bch_inode_init(c, &inode_u, i_uid_read(inode),
i_gid_read(inode), inode->i_mode, rdev);
+ bch_inode_pack(&inode_p, &inode_u);
- ret = bch_inode_create(c, &bkey_inode.k_i,
+ ret = bch_inode_create(c, &inode_p.inode.k_i,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (unlikely(ret)) {
@@ -225,7 +240,8 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c,
goto err;
}
- bch_vfs_inode_init(ei, inode_i_to_s_c(&bkey_inode));
+ inode_u.inum = inode_p.inode.k.p.inode;
+ bch_vfs_inode_init(c, ei, &inode_u);
if (default_acl) {
ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
@@ -266,7 +282,7 @@ static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir,
if (unlikely(ret))
return ret;
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
mark_inode_dirty_sync(dir);
return 0;
}
@@ -337,7 +353,7 @@ static int bch_link(struct dentry *old_dentry, struct inode *dir,
lockdep_assert_held(&inode->i_rwsem);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(dir->i_sb);
ret = bch_inc_nlink(c, ei);
if (ret)
@@ -382,12 +398,7 @@ static int bch_unlink(struct inode *dir, struct dentry *dentry)
drop_nlink(inode);
}
- drop_nlink(inode);
- if (inode->i_nlink) {
- mutex_lock(&ei->update_lock);
- ret = bch_write_inode(c, ei);
- mutex_unlock(&ei->update_lock);
- }
+ bch_dec_nlink(c, ei);
return 0;
}
@@ -473,7 +484,7 @@ static int bch_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = old_dentry->d_inode;
struct bch_inode_info *ei = to_bch_ei(old_inode);
struct inode *new_inode = new_dentry->d_inode;
- struct timespec now = CURRENT_TIME;
+ struct timespec now = current_fs_time(old_dir->i_sb);
int ret;
lockdep_assert_held(&old_dir->i_rwsem);
@@ -550,7 +561,7 @@ static int bch_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct bch_inode_info *ei = to_bch_ei(old_inode);
- struct timespec now = CURRENT_TIME;
+ struct timespec now = current_fs_time(old_dir->i_sb);
int ret;
ret = bch_dirent_rename(c,
@@ -783,14 +794,14 @@ static unsigned bch_inode_flags_to_user_flags(unsigned flags)
}
static int bch_inode_user_flags_set(struct bch_inode_info *ei,
- struct bch_inode *bi,
+ struct bch_inode_unpacked *bi,
void *p)
{
/*
* We're relying on btree locking here for exclusion with other ioctl
* calls - use the flags in the btree (@bi), not ei->i_flags:
*/
- unsigned bch_flags = le32_to_cpu(bi->i_flags);
+ unsigned bch_flags = bi->i_flags;
unsigned oldflags = bch_inode_flags_to_user_flags(bch_flags);
unsigned newflags = *((unsigned *) p);
unsigned i;
@@ -812,8 +823,8 @@ static int bch_inode_user_flags_set(struct bch_inode_info *ei,
if (oldflags != newflags)
return -EOPNOTSUPP;
- bi->i_flags = cpu_to_le32(bch_flags);
- ei->vfs_inode.i_ctime = CURRENT_TIME;
+ bi->i_flags = bch_flags;
+ ei->vfs_inode.i_ctime = current_fs_time(ei->vfs_inode.i_sb);
return 0;
}
@@ -1010,32 +1021,33 @@ static const struct address_space_operations bch_address_space_operations = {
.error_remove_page = generic_error_remove_page,
};
-static void bch_vfs_inode_init(struct bch_inode_info *ei,
- struct bkey_s_c_inode bkey_inode)
+static void bch_vfs_inode_init(struct cache_set *c,
+ struct bch_inode_info *ei,
+ struct bch_inode_unpacked *bi)
{
struct inode *inode = &ei->vfs_inode;
- const struct bch_inode *bi = bkey_inode.v;
pr_debug("init inode %llu with mode %o",
- bkey_inode.k->p.inode, bi->i_mode);
-
- ei->i_flags = le32_to_cpu(bi->i_flags);
- ei->i_size = le64_to_cpu(bi->i_size);
-
- inode->i_mode = le16_to_cpu(bi->i_mode);
- i_uid_write(inode, le32_to_cpu(bi->i_uid));
- i_gid_write(inode, le32_to_cpu(bi->i_gid));
-
- atomic64_set(&ei->i_sectors, le64_to_cpu(bi->i_sectors));
- inode->i_blocks = atomic64_read(&ei->i_sectors);
-
- inode->i_ino = bkey_inode.k->p.inode;
- set_nlink(inode, le32_to_cpu(bi->i_nlink));
- inode->i_rdev = le32_to_cpu(bi->i_dev);
- inode->i_size = le64_to_cpu(bi->i_size);
- inode->i_atime = ns_to_timespec(le64_to_cpu(bi->i_atime));
- inode->i_mtime = ns_to_timespec(le64_to_cpu(bi->i_mtime));
- inode->i_ctime = ns_to_timespec(le64_to_cpu(bi->i_ctime));
+ bi->inum, bi->i_mode);
+
+ ei->i_flags = bi->i_flags;
+ ei->i_size = bi->i_size;
+
+ inode->i_mode = bi->i_mode;
+ i_uid_write(inode, bi->i_uid);
+ i_gid_write(inode, bi->i_gid);
+
+ atomic64_set(&ei->i_sectors, bi->i_sectors);
+ inode->i_blocks = bi->i_sectors;
+
+ inode->i_ino = bi->inum;
+ set_nlink(inode, bi->i_nlink + nlink_bias(inode->i_mode));
+ inode->i_rdev = bi->i_dev;
+ inode->i_generation = bi->i_generation;
+ inode->i_size = bi->i_size;
+ inode->i_atime = bch_time_to_timespec(c, bi->i_atime);
+ inode->i_mtime = bch_time_to_timespec(c, bi->i_mtime);
+ inode->i_ctime = bch_time_to_timespec(c, bi->i_ctime);
bch_inode_flags_to_vfs(inode);
ei->str_hash = bch_hash_info_init(bi);
@@ -1149,8 +1161,8 @@ static int bch_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = atomic_long_read(&c->nr_inodes);
buf->f_ffree = U64_MAX;
- fsid = le64_to_cpup((void *) c->disk_sb.user_uuid.b) ^
- le64_to_cpup((void *) c->disk_sb.user_uuid.b + sizeof(u64));
+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
buf->f_namelen = NAME_MAX;
@@ -1380,7 +1392,7 @@ static struct dentry *bch_mount(struct file_system_type *fs_type,
sb->s_op = &bch_super_operations;
sb->s_xattr = bch_xattr_handlers;
sb->s_magic = BCACHE_STATFS_MAGIC;
- sb->s_time_gran = 1;
+ sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
sb->s_bdi = &c->bdi;
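bch_statfs() above now derives f_fsid from c->sb.user_uuid by xoring the two 64-bit halves of the UUID and splitting the result into two 32-bit words. A small userspace sketch of that folding; it reads the halves native-endian for simplicity, whereas the kernel code uses le64_to_cpup(), and the uuid bytes here are arbitrary example data:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned char uuid[16] = {
                0xde, 0xad, 0xbe, 0xef, 0x01, 0x23, 0x45, 0x67,
                0x89, 0xab, 0xcd, 0xef, 0x76, 0x54, 0x32, 0x10,
        };
        uint64_t lo, hi, fsid;

        memcpy(&lo, uuid, 8);
        memcpy(&hi, uuid + 8, 8);
        fsid = lo ^ hi;

        printf("f_fsid = %08x:%08x\n",
               (unsigned) (fsid & 0xFFFFFFFFUL),
               (unsigned) ((fsid >> 32) & 0xFFFFFFFFUL));
        return 0;
}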
diff --git a/libbcache/fs.h b/libbcache/fs.h
index c982024..aec6159 100644
--- a/libbcache/fs.h
+++ b/libbcache/fs.h
@@ -34,9 +34,16 @@ static inline u8 mode_to_type(umode_t mode)
return (mode >> 12) & 15;
}
+static inline unsigned nlink_bias(umode_t mode)
+{
+ return S_ISDIR(mode) ? 2 : 1;
+}
+
+struct bch_inode_unpacked;
+
/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
- struct bch_inode *, void *);
+ struct bch_inode_unpacked *, void *);
int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
inode_set_fn, void *);
diff --git a/libbcache/inode.c b/libbcache/inode.c
index 200deb0..b72a1c5 100644
--- a/libbcache/inode.c
+++ b/libbcache/inode.c
@@ -9,51 +9,195 @@
#include <linux/random.h>
-ssize_t bch_inode_status(char *buf, size_t len, const struct bkey *k)
+#include <asm/unaligned.h>
+
+#define FIELD_BYTES() \
+
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
+static const u8 bits_table[8] = {
+ 1 * 8 - 1,
+ 2 * 8 - 2,
+ 3 * 8 - 3,
+ 4 * 8 - 4,
+ 6 * 8 - 5,
+ 8 * 8 - 6,
+ 10 * 8 - 7,
+ 13 * 8 - 8,
+};
+
+static int inode_encode_field(u8 *out, u8 *end, const u64 in[2])
{
- if (k->p.offset)
- return scnprintf(buf, len, "offset nonzero: %llu", k->p.offset);
-
- if (k->size)
- return scnprintf(buf, len, "size nonzero: %u", k->size);
-
- switch (k->type) {
- case KEY_TYPE_DELETED:
- return scnprintf(buf, len, "deleted");
- case KEY_TYPE_DISCARD:
- return scnprintf(buf, len, "discarded");
- case KEY_TYPE_ERROR:
- return scnprintf(buf, len, "error");
- case KEY_TYPE_COOKIE:
- return scnprintf(buf, len, "cookie");
+ unsigned bytes, bits, shift;
- case BCH_INODE_FS:
- if (bkey_val_bytes(k) != sizeof(struct bch_inode))
- return scnprintf(buf, len, "bad size: %zu",
- bkey_val_bytes(k));
+ if (likely(!in[1]))
+ bits = fls64(in[0]);
+ else
+ bits = fls64(in[1]) + 64;
- if (k->p.inode < BLOCKDEV_INODE_MAX)
- return scnprintf(buf, len,
- "fs inode in blockdev range: %llu",
- k->p.inode);
- return 0;
+ for (shift = 1; shift <= 8; shift++)
+ if (bits < bits_table[shift - 1])
+ goto got_shift;
- case BCH_INODE_BLOCKDEV:
- if (bkey_val_bytes(k) != sizeof(struct bch_inode_blockdev))
- return scnprintf(buf, len, "bad size: %zu",
- bkey_val_bytes(k));
+ BUG();
+got_shift:
+ bytes = byte_table[shift - 1];
- if (k->p.inode >= BLOCKDEV_INODE_MAX)
- return scnprintf(buf, len,
- "blockdev inode in fs range: %llu",
- k->p.inode);
- return 0;
+ BUG_ON(out + bytes > end);
- default:
- return scnprintf(buf, len, "unknown inode type: %u", k->type);
+ if (likely(bytes <= 8)) {
+ u64 b = cpu_to_be64(in[0]);
+
+ memcpy(out, (void *) &b + 8 - bytes, bytes);
+ } else {
+ u64 b = cpu_to_be64(in[1]);
+
+ memcpy(out, (void *) &b + 16 - bytes, bytes);
+ put_unaligned_be64(in[0], out + bytes - 8);
+ }
+
+ *out |= (1 << 8) >> shift;
+
+ return bytes;
+}
+
+static int inode_decode_field(const u8 *in, const u8 *end,
+ u64 out[2], unsigned *out_bits)
+{
+ unsigned bytes, bits, shift;
+
+ if (in >= end)
+ return -1;
+
+ if (!*in)
+ return -1;
+
+ /*
+ * position of highest set bit indicates number of bytes:
+ * shift = number of bits to remove in high byte:
+ */
+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
+ bytes = byte_table[shift - 1];
+ bits = bytes * 8 - shift;
+
+ if (in + bytes > end)
+ return -1;
+
+ /*
+ * we're assuming it's safe to deref up to 7 bytes < in; this will work
+ * because keys always start quite a bit more than 7 bytes after the
+ * start of the btree node header:
+ */
+ if (likely(bytes <= 8)) {
+ out[0] = get_unaligned_be64(in + bytes - 8);
+ out[0] <<= 64 - bits;
+ out[0] >>= 64 - bits;
+ out[1] = 0;
+ } else {
+ out[0] = get_unaligned_be64(in + bytes - 8);
+ out[1] = get_unaligned_be64(in + bytes - 16);
+ out[1] <<= 128 - bits;
+ out[1] >>= 128 - bits;
+ }
+
+ *out_bits = out[1] ? 64 + fls64(out[1]) : fls64(out[0]);
+ return bytes;
+}
+
+void bch_inode_pack(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ u8 *out = packed->inode.v.fields;
+ u8 *end = (void *) &packed[1];
+ u8 *last_nonzero_field = out;
+ u64 field[2];
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+
+ bkey_inode_init(&packed->inode.k_i);
+ packed->inode.k.p.inode = inode->inum;
+ packed->inode.v.i_hash_seed = inode->i_hash_seed;
+ packed->inode.v.i_flags = cpu_to_le32(inode->i_flags);
+ packed->inode.v.i_mode = cpu_to_le16(inode->i_mode);
+
+#define BCH_INODE_FIELD(_name, _bits) \
+ field[0] = inode->_name; \
+ field[1] = 0; \
+ out += inode_encode_field(out, end, field); \
+ nr_fields++; \
+ \
+ if (field[0] | field[1]) { \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ }
+
+ BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
+
+ out = last_nonzero_field;
+ nr_fields = last_nonzero_fieldnr;
+
+ set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
+ memset(out, 0,
+ (u8 *) &packed->inode.v +
+ bkey_val_bytes(&packed->inode.k) - out);
+
+ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+
+ if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+ struct bch_inode_unpacked unpacked;
+
+ int ret = bch_inode_unpack(inode_i_to_s_c(&packed->inode),
+ &unpacked);
+ BUG_ON(ret);
+ BUG_ON(unpacked.inum != inode->inum);
+ BUG_ON(unpacked.i_hash_seed != inode->i_hash_seed);
+ BUG_ON(unpacked.i_mode != inode->i_mode);
+
+#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name);
+ BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
}
}
+int bch_inode_unpack(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ const u8 *in = inode.v->fields;
+ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+ u64 field[2];
+ unsigned fieldnr = 0, field_bits;
+ int ret;
+
+ unpacked->inum = inode.k->p.inode;
+ unpacked->i_hash_seed = inode.v->i_hash_seed;
+ unpacked->i_flags = le32_to_cpu(inode.v->i_flags);
+ unpacked->i_mode = le16_to_cpu(inode.v->i_mode);
+
+#define BCH_INODE_FIELD(_name, _bits) \
+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
+ memset(&unpacked->_name, 0, \
+ sizeof(*unpacked) - \
+ offsetof(struct bch_inode_unpacked, _name)); \
+ return 0; \
+ } \
+ \
+ ret = inode_decode_field(in, end, field, &field_bits); \
+ if (ret < 0) \
+ return ret; \
+ \
+ if (field_bits > sizeof(unpacked->_name) * 8) \
+ return -1; \
+ \
+ unpacked->_name = field[0]; \
+ in += ret;
+
+ BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
+
+ /* XXX: signal if there were more fields than expected? */
+
+ return 0;
+}
+
static const char *bch_inode_invalid(const struct cache_set *c,
struct bkey_s_c k)
{
@@ -63,16 +207,20 @@ static const char *bch_inode_invalid(const struct cache_set *c,
switch (k.k->type) {
case BCH_INODE_FS: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ struct bch_inode_unpacked unpacked;
- if (bkey_val_bytes(k.k) != sizeof(struct bch_inode))
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
return "incorrect value size";
if (k.k->p.inode < BLOCKDEV_INODE_MAX)
return "fs inode in blockdev range";
- if (INODE_STR_HASH_TYPE(inode.v) >= BCH_STR_HASH_NR)
+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
return "invalid str hash type";
+ if (bch_inode_unpack(inode, &unpacked))
+ return "invalid variable length fields";
+
return NULL;
}
case BCH_INODE_BLOCKDEV:
@@ -92,12 +240,17 @@ static void bch_inode_to_text(struct cache_set *c, char *buf,
size_t size, struct bkey_s_c k)
{
struct bkey_s_c_inode inode;
+ struct bch_inode_unpacked unpacked;
switch (k.k->type) {
case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k);
+ if (bch_inode_unpack(inode, &unpacked)) {
+ scnprintf(buf, size, "(unpack error)");
+ break;
+ }
- scnprintf(buf, size, "i_size %llu", inode.v->i_size);
+ scnprintf(buf, size, "i_size %llu", unpacked.i_size);
break;
}
}
@@ -107,26 +260,25 @@ const struct bkey_ops bch_bkey_inode_ops = {
.val_to_text = bch_inode_to_text,
};
-void bch_inode_init(struct cache_set *c, struct bkey_i_inode *inode,
+void bch_inode_init(struct cache_set *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
{
- struct timespec ts = CURRENT_TIME;
- s64 now = timespec_to_ns(&ts);
- struct bch_inode *bi;
-
- bi = &bkey_inode_init(&inode->k_i)->v;
- bi->i_uid = cpu_to_le32(uid);
- bi->i_gid = cpu_to_le32(gid);
-
- bi->i_mode = cpu_to_le16(mode);
- bi->i_dev = cpu_to_le32(rdev);
- bi->i_atime = cpu_to_le64(now);
- bi->i_mtime = cpu_to_le64(now);
- bi->i_ctime = cpu_to_le64(now);
- bi->i_nlink = cpu_to_le32(S_ISDIR(mode) ? 2 : 1);
-
- get_random_bytes(&bi->i_hash_seed, sizeof(bi->i_hash_seed));
- SET_INODE_STR_HASH_TYPE(bi, c->sb.str_hash_type);
+ s64 now = timespec_to_bch_time(c, CURRENT_TIME);
+
+ memset(inode_u, 0, sizeof(*inode_u));
+
+ /* ick */
+ inode_u->i_flags |= c->sb.str_hash_type << INODE_STR_HASH_OFFSET;
+ get_random_bytes(&inode_u->i_hash_seed, sizeof(inode_u->i_hash_seed));
+
+ inode_u->i_mode = mode;
+ inode_u->i_uid = uid;
+ inode_u->i_gid = gid;
+ inode_u->i_dev = rdev;
+ inode_u->i_atime = now;
+ inode_u->i_mtime = now;
+ inode_u->i_ctime = now;
+ inode_u->i_otime = now;
}
int bch_inode_create(struct cache_set *c, struct bkey_i *inode,
@@ -200,7 +352,7 @@ int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size,
struct extent_insert_hook *hook, u64 *journal_seq)
{
return bch_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0),
- 0, NULL, hook, journal_seq);
+ ZERO_VERSION, NULL, hook, journal_seq);
}
int bch_inode_rm(struct cache_set *c, u64 inode_nr)
@@ -215,7 +367,7 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr)
ret = bch_btree_delete_range(c, BTREE_ID_XATTRS,
POS(inode_nr, 0),
POS(inode_nr + 1, 0),
- 0, NULL, NULL, NULL);
+ ZERO_VERSION, NULL, NULL, NULL);
if (ret < 0)
return ret;
@@ -230,7 +382,7 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr)
ret = bch_btree_delete_range(c, BTREE_ID_DIRENTS,
POS(inode_nr, 0),
POS(inode_nr + 1, 0),
- 0, NULL, NULL, NULL);
+ ZERO_VERSION, NULL, NULL, NULL);
if (ret < 0)
return ret;
@@ -241,25 +393,19 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr)
NULL, NULL, BTREE_INSERT_NOFAIL);
}
-int bch_inode_update(struct cache_set *c, struct bkey_i *inode,
- u64 *journal_seq)
-{
- return bch_btree_update(c, BTREE_ID_INODES, inode, journal_seq);
-}
-
int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr,
- struct bkey_i_inode *inode)
+ struct bch_inode_unpacked *inode)
{
struct btree_iter iter;
struct bkey_s_c k;
+ int ret = -ENOENT;
for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
POS(inode_nr, 0), k) {
switch (k.k->type) {
case BCH_INODE_FS:
- bkey_reassemble(&inode->k_i, k);
- bch_btree_iter_unlock(&iter);
- return 0;
+ ret = bch_inode_unpack(bkey_s_c_to_inode(k), inode);
+ break;
default:
/* hole, not found */
break;
@@ -269,7 +415,7 @@ int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr,
}
- return bch_btree_iter_unlock(&iter) ?: -ENOENT;
+ return bch_btree_iter_unlock(&iter) ?: ret;
}
int bch_cached_dev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid,
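The variable-length inode fields introduced above use a length prefix in the first byte: the position of its highest set bit determines shift (1..8), which indexes byte_table[] for the field's total size and bits_table[] for the maximum value width. A userspace sketch of just the length-decoding step; field_bytes() is a hypothetical helper, not from the tree, and assumes a nonzero first byte (inode_decode_field() treats a zero byte as "no field"):

#include <assert.h>

static const unsigned char byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };

/* Number of bytes a packed field occupies, judging only by its first byte. */
static unsigned field_bytes(unsigned char first)
{
        unsigned msb = 0, shift;

        assert(first != 0);
        while (first >> (msb + 1))
                msb++;                  /* index of highest set bit, 0..7 */

        shift = 8 - msb;                /* mirrors `shift = 8 - __fls(*in)` */
        return byte_table[shift - 1];
}

int main(void)
{
        assert(field_bytes(0x80) == 1);  /* marker in bit 7: up to 7 bits, 1 byte   */
        assert(field_bytes(0x40) == 2);  /* marker in bit 6: up to 14 bits, 2 bytes */
        assert(field_bytes(0x01) == 13); /* marker in bit 0: up to 96 bits, 13 bytes */
        return 0;
}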
diff --git a/libbcache/inode.h b/libbcache/inode.h
index fa1a4cf..81dccf6 100644
--- a/libbcache/inode.h
+++ b/libbcache/inode.h
@@ -3,18 +3,53 @@
extern const struct bkey_ops bch_bkey_inode_ops;
-ssize_t bch_inode_status(char *, size_t, const struct bkey *);
+struct bch_inode_unpacked {
+ u64 inum;
+ __le64 i_hash_seed;
+ u32 i_flags;
+ u16 i_mode;
-void bch_inode_init(struct cache_set *, struct bkey_i_inode *,
+#define BCH_INODE_FIELD(_name, _bits) u##_bits _name;
+ BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
+};
+
+struct bkey_inode_buf {
+ struct bkey_i_inode inode;
+
+#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8
+ u8 _pad[0 + BCH_INODE_FIELDS()];
+#undef BCH_INODE_FIELD
+} __packed;
+
+void bch_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+
+void bch_inode_init(struct cache_set *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t);
int bch_inode_create(struct cache_set *, struct bkey_i *, u64, u64, u64 *);
int bch_inode_truncate(struct cache_set *, u64, u64,
struct extent_insert_hook *, u64 *);
int bch_inode_rm(struct cache_set *, u64);
-int bch_inode_update(struct cache_set *, struct bkey_i *, u64 *);
-int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *);
+int bch_inode_find_by_inum(struct cache_set *, u64,
+ struct bch_inode_unpacked *);
int bch_cached_dev_inode_find_by_uuid(struct cache_set *, uuid_le *,
struct bkey_i_inode_blockdev *);
+static inline struct timespec bch_time_to_timespec(struct cache_set *c, u64 time)
+{
+ return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline u64 timespec_to_bch_time(struct cache_set *c, struct timespec ts)
+{
+ s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo;
+
+ if (c->sb.time_precision == 1)
+ return ns;
+
+ return div_s64(ns, c->sb.time_precision);
+}
+
#endif
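The time helpers added above scale inode timestamps by the superblock's time_precision relative to time_base_lo. A worked example of the round trip, assuming time_precision = 1000 (microsecond granularity) and time_base_lo = 0; to_bch()/from_bch() are illustrative stand-ins for the inline functions above:

#include <assert.h>
#include <stdint.h>

static int64_t to_bch(int64_t ns, int64_t precision, int64_t base)
{
        return (ns - base) / precision;         /* div_s64() in the kernel */
}

static int64_t from_bch(int64_t t, int64_t precision, int64_t base)
{
        return t * precision + base;
}

int main(void)
{
        int64_t ns = 1234567890;        /* 1.234567890 s */

        /* sub-precision nanoseconds are dropped on the round trip: */
        assert(from_bch(to_bch(ns, 1000, 0), 1000, 0) == 1234567000);
        return 0;
}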
diff --git a/libbcache/io.c b/libbcache/io.c
index 4112ea5..2f0e48a 100644
--- a/libbcache/io.c
+++ b/libbcache/io.c
@@ -22,7 +22,7 @@
#include "move.h"
#include "notify.h"
#include "stats.h"
-#include "super.h"
+#include "super-io.h"
#include <linux/blkdev.h>
#include <linux/random.h>
@@ -382,11 +382,27 @@ static void bch_write_endio(struct bio *bio)
closure_put(cl);
}
+static struct nonce extent_nonce(struct bversion version,
+ unsigned nonce,
+ unsigned uncompressed_size,
+ unsigned compression_type)
+{
+ return (struct nonce) {{
+ [0] = cpu_to_le32((nonce << 12) |
+ (uncompressed_size << 22)),
+ [1] = cpu_to_le32(version.lo),
+ [2] = cpu_to_le32(version.lo >> 32),
+ [3] = cpu_to_le32(version.hi |
+ (compression_type << 24)) ^ BCH_NONCE_EXTENT,
+ }};
+}
+
static void init_append_extent(struct bch_write_op *op,
unsigned compressed_size,
unsigned uncompressed_size,
unsigned compression_type,
- u64 csum, unsigned csum_type,
+ unsigned nonce,
+ struct bch_csum csum, unsigned csum_type,
struct open_bucket *ob)
{
struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
@@ -394,11 +410,13 @@ static void init_append_extent(struct bch_write_op *op,
op->pos.offset += uncompressed_size;
e->k.p = op->pos;
e->k.size = uncompressed_size;
+ e->k.version = op->version;
+ bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
bch_extent_crc_append(e, compressed_size,
uncompressed_size,
compression_type,
- csum, csum_type);
+ nonce, csum, csum_type);
bch_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas,
ob, compressed_size);
@@ -417,7 +435,7 @@ static int bch_write_extent(struct bch_write_op *op,
unsigned key_to_write_offset = op->insert_keys.top_p -
op->insert_keys.keys_p;
struct bkey_i *key_to_write;
- unsigned csum_type = c->opts.data_checksum;
+ unsigned csum_type = op->csum_type;
unsigned compression_type = op->compression_type;
int ret;
@@ -426,8 +444,8 @@ static int bch_write_extent(struct bch_write_op *op,
/* Need to decompress data? */
if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
- (op->crc.uncompressed_size != op->size ||
- op->crc.compressed_size > ob->sectors_free)) {
+ (crc_uncompressed_size(NULL, &op->crc) != op->size ||
+ crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
int ret;
ret = bch_bio_uncompress_inplace(c, orig, op->size, op->crc);
@@ -439,9 +457,10 @@ static int bch_write_extent(struct bch_write_op *op,
if (op->flags & BCH_WRITE_DATA_COMPRESSED) {
init_append_extent(op,
- op->crc.compressed_size,
- op->crc.uncompressed_size,
+ crc_compressed_size(NULL, &op->crc),
+ crc_uncompressed_size(NULL, &op->crc),
op->crc.compression_type,
+ op->crc.nonce,
op->crc.csum,
op->crc.csum_type,
ob);
@@ -457,7 +476,10 @@ static int bch_write_extent(struct bch_write_op *op,
/* all units here in bytes */
unsigned total_output = 0, output_available =
min(ob->sectors_free << 9, orig->bi_iter.bi_size);
- u64 csum;
+ unsigned crc_nonce = bch_csum_type_is_encryption(csum_type)
+ ? op->nonce : 0;
+ struct bch_csum csum;
+ struct nonce nonce;
bio = bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(output_available, PAGE_SIZE),
@@ -489,13 +511,20 @@ static int bch_write_extent(struct bch_write_op *op,
BUG_ON(src_len & (block_bytes(c) - 1));
swap(bio->bi_iter.bi_size, dst_len);
- csum = bch_checksum_bio(bio, csum_type);
+ nonce = extent_nonce(op->version,
+ crc_nonce,
+ src_len >> 9,
+ compression_type);
+
+ bch_encrypt_bio(c, csum_type, nonce, bio);
+
+ csum = bch_checksum_bio(c, csum_type, nonce, bio);
swap(bio->bi_iter.bi_size, dst_len);
init_append_extent(op,
dst_len >> 9, src_len >> 9,
fragment_compression_type,
- csum, csum_type, ob);
+ crc_nonce, csum, csum_type, ob);
total_output += dst_len;
bio_advance(bio, dst_len);
@@ -531,7 +560,8 @@ static int bch_write_extent(struct bch_write_op *op,
wbio->put_bio = bio != orig;
init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
- compression_type, 0, csum_type, ob);
+ compression_type, 0,
+ (struct bch_csum) { 0 }, csum_type, ob);
ret = bio != orig;
}
@@ -546,8 +576,7 @@ static int bch_write_extent(struct bch_write_op *op,
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
- if (!(op->flags & BCH_WRITE_CACHED))
- bch_check_mark_super(c, key_to_write, false);
+ bch_check_mark_super(c, key_to_write, false);
#ifndef CONFIG_BCACHE_NO_IO
bch_submit_wbio_replicas(to_wbio(bio), c, key_to_write, false);
@@ -748,6 +777,11 @@ void bch_write(struct closure *cl)
closure_return(cl);
}
+ if (bversion_zero(op->version) &&
+ bch_csum_type_is_encryption(op->csum_type))
+ op->version.lo =
+ atomic64_inc_return(&c->key_version) + 1;
+
if (!(op->flags & BCH_WRITE_DISCARD))
bch_increment_clock(c, bio_sectors(bio), WRITE);
@@ -804,17 +838,21 @@ void bch_write_op_init(struct bch_write_op *op, struct cache_set *c,
struct write_point *wp, struct bpos pos,
u64 *journal_seq, unsigned flags)
{
+ EBUG_ON(res.sectors && !res.nr_replicas);
+
op->c = c;
op->io_wq = index_update_wq(op);
op->bio = bio;
op->written = 0;
op->error = 0;
op->flags = flags;
+ op->csum_type = bch_data_checksum_type(c);
op->compression_type = c->opts.compression;
op->nr_replicas = res.nr_replicas;
op->alloc_reserve = RESERVE_NONE;
+ op->nonce = 0;
op->pos = pos;
- op->version = 0;
+ op->version = ZERO_VERSION;
op->res = res;
op->wp = wp;
@@ -853,7 +891,7 @@ void bch_write_op_init(struct bch_write_op *op, struct cache_set *c,
* appropriately inode_truncate should call this
*/
int bch_discard(struct cache_set *c, struct bpos start,
- struct bpos end, u64 version,
+ struct bpos end, struct bversion version,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq)
@@ -878,7 +916,11 @@ static int bio_checksum_uncompress(struct cache_set *c,
struct bio *src = &rbio->bio;
struct bio *dst = &bch_rbio_parent(rbio)->bio;
struct bvec_iter dst_iter = rbio->parent_iter;
- u64 csum;
+ struct nonce nonce = extent_nonce(rbio->version,
+ rbio->crc.nonce,
+ crc_uncompressed_size(NULL, &rbio->crc),
+ rbio->crc.compression_type);
+ struct bch_csum csum;
int ret = 0;
/*
@@ -888,18 +930,19 @@ static int bio_checksum_uncompress(struct cache_set *c,
* in order to promote
*/
if (rbio->bounce) {
- src->bi_iter.bi_size = rbio->crc.compressed_size << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
+ src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
} else {
src->bi_iter = rbio->parent_iter;
}
- csum = bch_checksum_bio(src, rbio->crc.csum_type);
- if (cache_nonfatal_io_err_on(rbio->crc.csum != csum, rbio->ca,
- "data checksum error, inode %llu offset %llu: expected %0llx got %0llx (type %u)",
+ csum = bch_checksum_bio(c, rbio->crc.csum_type, nonce, src);
+ if (cache_nonfatal_io_err_on(bch_crc_cmp(rbio->crc.csum, csum), rbio->ca,
+ "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
- rbio->crc.csum, csum, rbio->crc.csum_type))
+ rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo,
+ rbio->crc.csum_type))
ret = -EIO;
/*
@@ -908,6 +951,7 @@ static int bio_checksum_uncompress(struct cache_set *c,
*/
if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
if (!ret) {
+ bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
ret = bch_bio_uncompress(c, src, dst,
dst_iter, rbio->crc);
if (ret)
@@ -915,8 +959,20 @@ static int bio_checksum_uncompress(struct cache_set *c,
}
} else if (rbio->bounce) {
bio_advance(src, rbio->crc.offset << 9);
+
+ /* don't need to decrypt the entire bio: */
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
+
+ nonce = nonce_add(nonce, rbio->crc.offset << 9);
+
+ bch_encrypt_bio(c, rbio->crc.csum_type,
+ nonce, src);
+
bio_copy_data_iter(dst, dst_iter,
src, src->bi_iter);
+ } else {
+ bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
}
return ret;
@@ -1108,7 +1164,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
*/
unsigned sectors =
max_t(unsigned, k.k->size,
- pick->crc.uncompressed_size);
+ crc_uncompressed_size(NULL, &pick->crc));
unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
promote_op = kmalloc(sizeof(*promote_op) +
@@ -1130,7 +1186,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
*/
if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
(pick->crc.csum_type != BCH_CSUM_NONE &&
- (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
+ (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
(flags & BCH_READ_FORCE_BOUNCE)))) {
read_full = true;
bounce = true;
@@ -1138,7 +1194,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
if (bounce) {
unsigned sectors = read_full
- ? (pick->crc.compressed_size ?: k.k->size)
+ ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
: bvec_iter_sectors(iter);
rbio = container_of(bio_alloc_bioset(GFP_NOIO,
@@ -1183,6 +1239,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
rbio->flags = flags;
rbio->bounce = bounce;
rbio->split = split;
+ rbio->version = k.k->version;
rbio->crc = pick->crc;
/*
* crc.compressed_size will be 0 if there wasn't any checksum
@@ -1190,7 +1247,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
* bounced (which isn't necessarily the original key size, if we bounced
* only for promoting)
*/
- rbio->crc.compressed_size = bio_sectors(&rbio->bio);
+ rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1;
rbio->ptr = pick->ptr;
rbio->ca = pick->ca;
rbio->promote = promote_op;
@@ -1210,7 +1267,8 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
bch_migrate_write_init(c, &promote_op->write,
&c->promote_write_point,
k, NULL,
- BCH_WRITE_ALLOC_NOWAIT);
+ BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_CACHED);
promote_op->write.promote = true;
if (rbio->crc.compression_type) {
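/*
 * Illustrative sketch, not part of the patch: the read path above now
 * verifies the checksum over the on-disk (possibly encrypted) data first,
 * and only then decrypts in place.  bch_encrypt_bio() is also used for
 * decryption, which assumes a symmetric stream cipher where encrypting
 * twice is a no-op.  Builds only against the in-tree headers.
 */
static int verify_then_decrypt_sketch(struct cache_set *c,
				      struct bch_read_bio *rbio,
				      struct bio *src)
{
	struct nonce nonce = extent_nonce(rbio->version,
					  rbio->crc.nonce,
					  crc_uncompressed_size(NULL, &rbio->crc),
					  rbio->crc.compression_type);
	struct bch_csum csum = bch_checksum_bio(c, rbio->crc.csum_type, nonce, src);

	if (bch_crc_cmp(rbio->crc.csum, csum))
		return -EIO;	/* the stored checksum covers the ciphertext */

	/* decrypt in place before any decompression / copy to the parent bio */
	bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
	return 0;
}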
diff --git a/libbcache/io.h b/libbcache/io.h
index b7668b4..99e5108 100644
--- a/libbcache/io.h
+++ b/libbcache/io.h
@@ -79,7 +79,7 @@ void bch_submit_wbio_replicas(struct bch_write_bio *, struct cache_set *,
const struct bkey_i *, bool);
int bch_discard(struct cache_set *, struct bpos, struct bpos,
- u64, struct disk_reservation *,
+ struct bversion, struct disk_reservation *,
struct extent_insert_hook *, u64 *);
void bch_read_retry_work(struct work_struct *);
diff --git a/libbcache/io_types.h b/libbcache/io_types.h
index f7d99cd..64269d9 100644
--- a/libbcache/io_types.h
+++ b/libbcache/io_types.h
@@ -43,7 +43,8 @@ struct bch_read_bio {
u8 bounce:1,
split:1;
- struct bch_extent_crc64 crc;
+ struct bversion version;
+ struct bch_extent_crc128 crc;
struct bch_extent_ptr ptr;
struct cache *ca;
@@ -101,15 +102,17 @@ struct bch_write_op {
short error;
u16 flags;
+ unsigned csum_type:4;
unsigned compression_type:4;
unsigned nr_replicas:4;
unsigned alloc_reserve:4;
+ unsigned nonce:14;
struct bpos pos;
- unsigned version;
+ struct bversion version;
/* For BCH_WRITE_DATA_COMPRESSED: */
- struct bch_extent_crc64 crc;
+ struct bch_extent_crc128 crc;
unsigned size;
struct disk_reservation res;
diff --git a/libbcache/journal.c b/libbcache/journal.c
index 9e09b86..3bb9e3c 100644
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -18,7 +18,8 @@
#include "io.h"
#include "keylist.h"
#include "journal.h"
-#include "super.h"
+#include "super-io.h"
+#include "vstructs.h"
#include <trace/events/bcache.h>
@@ -52,19 +53,14 @@ static inline u64 journal_pin_seq(struct journal *j,
return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
}
-#define for_each_jset_entry(entry, jset) \
- for (entry = (jset)->start; \
- entry < bkey_idx(jset, le32_to_cpu((jset)->u64s)); \
- entry = jset_keys_next(entry))
-
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
- while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) {
+ while (entry < vstruct_last(jset)) {
if (JOURNAL_ENTRY_TYPE(entry) == type)
return entry;
- entry = jset_keys_next(entry);
+ entry = vstruct_next(entry);
}
return NULL;
@@ -73,14 +69,11 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
- entry = jset_keys_next(entry))
+ entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
- for (k = (entry)->start; \
- (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\
- (_n = bkey_next(k), 1)); \
- k = _n)
+ vstruct_for_each_safe(entry, k, _n)
static inline void bch_journal_add_entry(struct journal_buf *buf,
const void *data, size_t u64s,
@@ -199,8 +192,6 @@ redo_peek:
closure_sync(&cl);
- mutex_lock(&c->btree_interior_update_lock);
-
for (i = 0;; i++) {
struct btree_interior_update *as;
struct pending_btree_node_free *d;
@@ -212,6 +203,8 @@ redo_peek:
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
+redo_wait:
+ mutex_lock(&c->btree_interior_update_lock);
/*
* Is the node on the list of pending interior node updates -
@@ -225,11 +218,11 @@ redo_peek:
closure_wait(&as->wait, &cl);
mutex_unlock(&c->btree_interior_update_lock);
closure_sync(&cl);
- break;
+ goto redo_wait;
}
- }
- mutex_unlock(&c->btree_interior_update_lock);
+ mutex_unlock(&c->btree_interior_update_lock);
+ }
mutex_lock(&j->blacklist_lock);
@@ -377,7 +370,6 @@ out:
struct journal_list {
struct closure cl;
struct mutex lock;
- struct mutex cache_set_buffer_lock;
struct list_head *head;
int ret;
};
@@ -394,7 +386,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
{
struct journal_replay *i, *pos;
struct list_head *where;
- size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+ size_t bytes = vstruct_bytes(j);
__le64 last_seq;
int ret;
@@ -422,8 +414,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
list_for_each_entry_reverse(i, jlist->head, list) {
/* Duplicate? */
if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
- fsck_err_on(bytes != __set_bytes(&i->j,
- le32_to_cpu(i->j.u64s)) ||
+ fsck_err_on(bytes != vstruct_bytes(&i->j) ||
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
@@ -455,11 +446,21 @@ fsck_err:
return ret;
}
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
static void journal_entry_null_range(void *start, void *end)
{
struct jset_entry *entry;
- for (entry = start; entry != end; entry = jset_keys_next(entry)) {
+ for (entry = start; entry != end; entry = vstruct_next(entry)) {
entry->u64s = 0;
entry->btree_id = 0;
entry->level = 0;
@@ -473,7 +474,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
struct bkey_i *k, enum bkey_type key_type,
const char *type)
{
- void *next = jset_keys_next(entry);
+ void *next = vstruct_next(entry);
const char *invalid;
char buf[160];
int ret = 0;
@@ -481,16 +482,16 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
if (mustfix_fsck_err_on(!k->k.u64s, c,
"invalid %s in journal: k->u64s 0", type)) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
if (mustfix_fsck_err_on((void *) bkey_next(k) >
- (void *) jset_keys_next(entry), c,
+ (void *) vstruct_next(entry), c,
"invalid %s in journal: extends past end of journal entry",
type)) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
@@ -499,7 +500,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
type, k->k.format)) {
le16_add_cpu(&entry->u64s, -k->k.u64s);
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
@@ -514,7 +515,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
le16_add_cpu(&entry->u64s, -k->k.u64s);
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
- journal_entry_null_range(jset_keys_next(entry), next);
+ journal_entry_null_range(vstruct_next(entry), next);
return 0;
}
fsck_err:
@@ -525,16 +526,17 @@ fsck_err:
#define JOURNAL_ENTRY_NONE 6
#define JOURNAL_ENTRY_BAD 7
-static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector,
+static int journal_entry_validate(struct cache_set *c,
+ struct jset *j, u64 sector,
unsigned bucket_sectors_left,
unsigned sectors_read)
{
struct jset_entry *entry;
- size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
- u64 got, expect;
+ size_t bytes = vstruct_bytes(j);
+ struct bch_csum csum;
int ret = 0;
- if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb))
+ if (le64_to_cpu(j->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
@@ -554,25 +556,32 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
if (bytes > sectors_read << 9)
return JOURNAL_ENTRY_REREAD;
- got = le64_to_cpu(j->csum);
- expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j));
- if (mustfix_fsck_err_on(got != expect, c,
- "journal checksum bad (got %llu expect %llu), sector %lluu",
- got, expect, sector)) {
+ if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+ "journal entry with unknown csum type %llu sector %lluu",
+ JSET_CSUM_TYPE(j), sector))
+ return JOURNAL_ENTRY_BAD;
+
+ csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+ if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c,
+ "journal checksum bad, sector %llu", sector)) {
/* XXX: retry IO, when we start retrying checksum errors */
/* XXX: note we might have missing journal entries */
return JOURNAL_ENTRY_BAD;
}
- if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq),
- c, "invalid journal entry: last_seq > seq"))
+ bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+
+ if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+ "invalid journal entry: last_seq > seq"))
j->last_seq = j->seq;
- for_each_jset_entry(entry, j) {
+ vstruct_for_each(j, entry) {
struct bkey_i *k;
- if (mustfix_fsck_err_on(jset_keys_next(entry) >
- bkey_idx(j, le32_to_cpu(j->u64s)), c,
+ if (mustfix_fsck_err_on(vstruct_next(entry) >
+ vstruct_last(j), c,
"journal entry extents past end of jset")) {
j->u64s = cpu_to_le64((u64 *) entry - j->_data);
break;
@@ -580,9 +589,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
switch (JOURNAL_ENTRY_TYPE(entry)) {
case JOURNAL_ENTRY_BTREE_KEYS:
- for (k = entry->start;
- k < bkey_idx(entry, le16_to_cpu(entry->u64s));
- k = bkey_next(k)) {
+ vstruct_for_each(entry, k) {
ret = journal_validate_key(c, j, entry, k,
bkey_type(entry->level,
entry->btree_id),
@@ -599,7 +606,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
le16_to_cpu(entry->u64s) != k->k.u64s, c,
"invalid btree root journal entry: wrong number of keys")) {
journal_entry_null_range(entry,
- jset_keys_next(entry));
+ vstruct_next(entry));
continue;
}
@@ -616,14 +623,14 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry,
- jset_keys_next(entry));
+ vstruct_next(entry));
}
break;
default:
mustfix_fsck_err(c, "invalid journal entry type %llu",
JOURNAL_ENTRY_TYPE(entry));
- journal_entry_null_range(entry, jset_keys_next(entry));
+ journal_entry_null_range(entry, vstruct_next(entry));
break;
}
}
@@ -632,126 +639,127 @@ fsck_err:
return ret;
}
-static int journal_read_bucket(struct cache *ca, struct journal_list *jlist,
+struct journal_read_buf {
+ void *data;
+ size_t size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+ size_t new_size)
+{
+ void *n;
+
+ new_size = roundup_pow_of_two(new_size);
+ n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+ if (!n)
+ return -ENOMEM;
+
+ free_pages((unsigned long) b->data, get_order(b->size));
+ b->data = n;
+ b->size = new_size;
+ return 0;
+}
+
+static int journal_read_bucket(struct cache *ca,
+ struct journal_read_buf *buf,
+ struct journal_list *jlist,
unsigned bucket, u64 *seq, bool *entries_found)
{
struct cache_set *c = ca->set;
struct journal_device *ja = &ca->journal;
struct bio *bio = ja->bio;
- struct jset *j, *data;
- unsigned blocks, sectors_read, bucket_offset = 0;
- unsigned max_entry_sectors = c->journal.entry_size_max >> 9;
- u64 sector = bucket_to_sector(ca,
- journal_bucket(ca->disk_sb.sb, bucket));
+ struct jset *j = NULL;
+ unsigned sectors, sectors_read = 0;
+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+ end = offset + ca->mi.bucket_size;
bool saw_bad = false;
int ret = 0;
- data = (void *) __get_free_pages(GFP_KERNEL,
- get_order(c->journal.entry_size_max));
- if (!data) {
- mutex_lock(&jlist->cache_set_buffer_lock);
- data = c->journal.buf[0].data;
- }
-
pr_debug("reading %u", bucket);
- while (bucket_offset < ca->mi.bucket_size) {
-reread:
- sectors_read = min_t(unsigned,
- ca->mi.bucket_size - bucket_offset,
- max_entry_sectors);
+ while (offset < end) {
+ if (!sectors_read) {
+reread: sectors_read = min_t(unsigned,
+ end - offset, buf->size >> 9);
- bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_iter.bi_sector = sector + bucket_offset;
- bio->bi_iter.bi_size = sectors_read << 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch_bio_map(bio, data);
-
- ret = submit_bio_wait(bio);
-
- if (cache_fatal_io_err_on(ret, ca,
- "journal read from sector %llu",
- sector + bucket_offset) ||
- bch_meta_read_fault("journal")) {
- ret = -EIO;
- goto err;
- }
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = offset;
+ bio->bi_iter.bi_size = sectors_read << 9;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bch_bio_map(bio, buf->data);
- /* This function could be simpler now since we no longer write
- * journal entries that overlap bucket boundaries; this means
- * the start of a bucket will always have a valid journal entry
- * if it has any journal entries at all.
- */
+ ret = submit_bio_wait(bio);
- j = data;
- while (sectors_read) {
- ret = journal_entry_validate(c, j,
- sector + bucket_offset,
- ca->mi.bucket_size - bucket_offset,
- sectors_read);
- switch (ret) {
- case BCH_FSCK_OK:
- break;
- case JOURNAL_ENTRY_REREAD:
- goto reread;
- case JOURNAL_ENTRY_NONE:
- if (!saw_bad)
- goto out;
- blocks = 1;
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
- blocks = 1;
- goto next_block;
- default:
- goto err;
- }
+ if (cache_fatal_io_err_on(ret, ca,
+ "journal read from sector %llu",
+ offset) ||
+ bch_meta_read_fault("journal"))
+ return -EIO;
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- goto out;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- ret = journal_entry_add(c, jlist, j);
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- *entries_found = true;
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- goto err;
+ j = buf->data;
+ }
+
+ ret = journal_entry_validate(c, j, offset,
+ end - offset, sectors_read);
+ switch (ret) {
+ case BCH_FSCK_OK:
+ break;
+ case JOURNAL_ENTRY_REREAD:
+ if (vstruct_bytes(j) > buf->size) {
+ ret = journal_read_buf_realloc(buf,
+ vstruct_bytes(j));
+ if (ret)
+ return ret;
}
+ goto reread;
+ case JOURNAL_ENTRY_NONE:
+ if (!saw_bad)
+ return 0;
+ sectors = c->sb.block_size;
+ goto next_block;
+ case JOURNAL_ENTRY_BAD:
+ saw_bad = true;
+ sectors = c->sb.block_size;
+ goto next_block;
+ default:
+ return ret;
+ }
- if (le64_to_cpu(j->seq) > *seq)
- *seq = le64_to_cpu(j->seq);
-next_block:
- blocks = __set_blocks(j, le32_to_cpu(j->u64s),
- block_bytes(c));
+ /*
+ * This happens sometimes if we don't have discards on -
+ * when we've partially overwritten a bucket with new
+ * journal entries. We don't need the rest of the
+ * bucket:
+ */
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+ return 0;
+
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- pr_debug("next");
- bucket_offset += blocks * c->sb.block_size;
- sectors_read -= blocks * c->sb.block_size;
- j = ((void *) j) + blocks * block_bytes(c);
+ ret = journal_entry_add(c, jlist, j);
+ switch (ret) {
+ case JOURNAL_ENTRY_ADD_OK:
+ *entries_found = true;
+ break;
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+ break;
+ default:
+ return ret;
}
+
+ if (le64_to_cpu(j->seq) > *seq)
+ *seq = le64_to_cpu(j->seq);
+
+ sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+ pr_debug("next");
+ offset += sectors;
+ sectors_read -= sectors;
+ j = ((void *) j) + (sectors << 9);
}
-out:
- ret = 0;
-err:
- if (data == c->journal.buf[0].data)
- mutex_unlock(&jlist->cache_set_buffer_lock);
- else
- free_pages((unsigned long) data,
- get_order(c->journal.entry_size_max));
- return ret;
+ return 0;
}
static void bch_journal_read_device(struct closure *cl)
@@ -759,15 +767,11 @@ static void bch_journal_read_device(struct closure *cl)
#define read_bucket(b) \
({ \
bool entries_found = false; \
- int ret = journal_read_bucket(ca, jlist, b, \
- &seq, &entries_found); \
+ ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
+ &entries_found); \
+ if (ret) \
+ goto err; \
__set_bit(b, bitmap); \
- if (ret) { \
- mutex_lock(&jlist->lock); \
- jlist->ret = ret; \
- mutex_unlock(&jlist->lock); \
- closure_return(cl); \
- } \
entries_found; \
})
@@ -777,24 +781,29 @@ static void bch_journal_read_device(struct closure *cl)
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+ struct journal_read_buf buf = { NULL, 0 };
- unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
- DECLARE_BITMAP(bitmap, nr_buckets);
+ DECLARE_BITMAP(bitmap, ja->nr);
unsigned i, l, r;
u64 seq = 0;
+ int ret;
- if (!nr_buckets)
- closure_return(cl);
+ if (!ja->nr)
+ goto out;
+
+ bitmap_zero(bitmap, ja->nr);
+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ if (ret)
+ goto err;
- bitmap_zero(bitmap, nr_buckets);
- pr_debug("%u journal buckets", nr_buckets);
+ pr_debug("%u journal buckets", ja->nr);
/*
* If the device supports discard but not secure discard, we can't do
* the fancy fibonacci hash/binary search because the live journal
* entries might not form a contiguous range:
*/
- for (i = 0; i < nr_buckets; i++)
+ for (i = 0; i < ja->nr; i++)
read_bucket(i);
goto search_done;
@@ -805,8 +814,8 @@ static void bch_journal_read_device(struct closure *cl)
* Read journal buckets ordered by golden ratio hash to quickly
* find a sequence of buckets with valid journal entries
*/
- for (i = 0; i < nr_buckets; i++) {
- l = (i * 2654435769U) % nr_buckets;
+ for (i = 0; i < ja->nr; i++) {
+ l = (i * 2654435769U) % ja->nr;
if (test_bit(l, bitmap))
break;
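/*
 * Toy standalone demo (illustrative only) of the golden-ratio probe order in
 * the loop above: each index is multiplied by 2654435769 (~2^32 / phi) and
 * reduced mod the bucket count, which scatters probes across the buckets.
 * It is not guaranteed to hit every bucket, hence the bitmap and the
 * linear-scan fallback that follow.
 */
#include <stdio.h>

int main(void)
{
	unsigned nr = 10, i;

	for (i = 0; i < nr; i++)
		printf("probe %u -> bucket %u\n", i, (i * 2654435769U) % nr);
	return 0;
}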
@@ -821,18 +830,18 @@ static void bch_journal_read_device(struct closure *cl)
*/
pr_debug("falling back to linear search");
linear_scan:
- for (l = find_first_zero_bit(bitmap, nr_buckets);
- l < nr_buckets;
- l = find_next_zero_bit(bitmap, nr_buckets, l + 1))
+ for (l = find_first_zero_bit(bitmap, ja->nr);
+ l < ja->nr;
+ l = find_next_zero_bit(bitmap, ja->nr, l + 1))
if (read_bucket(l))
goto bsearch;
/* no journal entries on this device? */
- if (l == nr_buckets)
- closure_return(cl);
+ if (l == ja->nr)
+ goto out;
bsearch:
/* Binary search */
- r = find_next_bit(bitmap, nr_buckets, l + 1);
+ r = find_next_bit(bitmap, ja->nr, l + 1);
pr_debug("starting binary search, l %u r %u", l, r);
while (l + 1 < r) {
@@ -858,9 +867,9 @@ search_done:
*/
seq = 0;
- for (i = 0; i < nr_buckets; i++)
+ for (i = 0; i < ja->nr; i++)
if (ja->bucket_seq[i] >= seq &&
- ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) {
+ ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
/*
* When journal_next_bucket() goes to allocate for
* the first time, it'll use the bucket after
@@ -875,20 +884,26 @@ search_done:
* reclaimed - journal reclaim will immediately reclaim whatever isn't
* pinned when it first runs:
*/
- ja->last_idx = (ja->cur_idx + 1) % nr_buckets;
+ ja->last_idx = (ja->cur_idx + 1) % ja->nr;
/*
* Read buckets in reverse order until we stop finding more journal
* entries:
*/
- for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets;
+ for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
i != ja->cur_idx;
- i = (i + nr_buckets - 1) % nr_buckets)
+ i = (i + ja->nr - 1) % ja->nr)
if (!test_bit(i, bitmap) &&
!read_bucket(i))
break;
-
+out:
+ free_pages((unsigned long) buf.data, get_order(buf.size));
closure_return(cl);
+err:
+ mutex_lock(&jlist->lock);
+ jlist->ret = ret;
+ mutex_unlock(&jlist->lock);
+ goto out;
#undef read_bucket
}
@@ -930,6 +945,19 @@ static int journal_seq_blacklist_read(struct journal *j,
return 0;
}
+static inline bool journal_has_keys(struct list_head *list)
+{
+ struct journal_replay *i;
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
+
+ list_for_each_entry(i, list, list)
+ for_each_jset_key(k, _n, entry, &i->j)
+ return true;
+
+ return false;
+}
+
int bch_journal_read(struct cache_set *c, struct list_head *list)
{
struct jset_entry *prio_ptrs;
@@ -944,7 +972,6 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
- mutex_init(&jlist.cache_set_buffer_lock);
jlist.head = list;
jlist.ret = 0;
@@ -964,6 +991,9 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
return BCH_FSCK_REPAIR_IMPOSSIBLE;
}
+ fsck_err_on(c->sb.clean && journal_has_keys(list), c,
+ "filesystem marked clean but journal has keys to replay");
+
j = &list_entry(list->prev, struct journal_replay, list)->j;
unfixable_fsck_err_on(le64_to_cpu(j->seq) -
@@ -1057,7 +1087,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
if (btree_type_has_ptrs(type))
- __bch_btree_mark_key(c, type, k_s_c);
+ bch_btree_mark_key_initial(c, type, k_s_c);
}
}
@@ -1171,10 +1201,9 @@ static enum {
buf->data->last_seq = cpu_to_le64(last_seq(j));
j->prev_buf_sectors =
- __set_blocks(buf->data,
- le32_to_cpu(buf->data->u64s) +
- journal_entry_u64s_reserve(buf),
- block_bytes(c)) * c->sb.block_size;
+ vstruct_blocks_plus(buf->data, c->block_bits,
+ journal_entry_u64s_reserve(buf)) *
+ c->sb.block_size;
BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
@@ -1219,9 +1248,8 @@ static unsigned journal_dev_buckets_available(struct journal *j,
struct cache *ca)
{
struct journal_device *ja = &ca->journal;
- unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb);
- unsigned next = (ja->cur_idx + 1) % nr;
- unsigned available = (ja->last_idx + nr - next) % nr;
+ unsigned next = (ja->cur_idx + 1) % ja->nr;
+ unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
/*
* Hack to avoid a deadlock during journal replay:
@@ -1271,7 +1299,7 @@ static int journal_entry_sectors(struct journal *j)
* for the previous entry we have to make sure we have space for
* it too:
*/
- if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+ if (bch_extent_has_device(e.c, ca->dev_idx)) {
if (j->prev_buf_sectors > ca->journal.sectors_free)
buckets_required++;
@@ -1479,17 +1507,28 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list)
entries++;
}
+ if (keys) {
+ bch_btree_flush(c);
+
+ /*
+ * Write a new journal entry _before_ we start journalling new data -
+ * otherwise, we could end up with btree node bsets with journal seqs
+ * arbitrarily far in the future vs. the most recently written journal
+ * entry on disk, if we crash before writing the next journal entry:
+ */
+ ret = bch_journal_meta(&c->journal);
+ if (ret)
+ goto err;
+ }
+
bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
keys, entries, (u64) atomic64_read(&j->seq));
- fsck_err_on(c->sb.clean && keys, c,
- "filesystem marked clean, but journal had keys to replay");
-
bch_journal_set_replay_done(&c->journal);
err:
if (ret)
bch_err(c, "journal replay error: %d", ret);
-fsck_err:
+
bch_journal_entries_free(list);
return ret;
@@ -1497,28 +1536,40 @@ fsck_err:
static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
{
- unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr;
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets =
+ bch_sb_get_journal(ca->disk_sb.sb);
+ struct bch_sb_field *f;
u64 *p;
- int ret;
- ret = bch_super_realloc(&ca->disk_sb, u64s);
- if (ret)
- return ret;
+ p = krealloc(ja->bucket_seq, nr * sizeof(u64),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!p)
+ return -ENOMEM;
+
+ ja->bucket_seq = p;
- p = krealloc(ca->journal.bucket_seq,
- nr * sizeof(u64),
+ p = krealloc(ja->buckets, nr * sizeof(u64),
GFP_KERNEL|__GFP_ZERO);
if (!p)
return -ENOMEM;
- ca->journal.bucket_seq = p;
- ca->disk_sb.sb->u64s = cpu_to_le16(u64s);
+ ja->buckets = p;
+
+ f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
+ sizeof(*journal_buckets) / sizeof(u64));
+ if (!f)
+ return -ENOMEM;
+ f->type = BCH_SB_FIELD_journal;
+ ja->nr = nr;
return 0;
}
int bch_cache_journal_alloc(struct cache *ca)
{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets;
int ret;
unsigned i;
@@ -1540,11 +1591,15 @@ int bch_cache_journal_alloc(struct cache *ca)
if (ret)
return ret;
- for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) {
- unsigned long r = ca->mi.first_bucket + i;
+ journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
+
+ for (i = 0; i < ja->nr; i++) {
+ u64 bucket = ca->mi.first_bucket + i;
- bch_mark_metadata_bucket(ca, &ca->buckets[r], true);
- set_journal_bucket(ca->disk_sb.sb, i, r);
+ ja->buckets[i] = bucket;
+ journal_buckets->buckets[i] = cpu_to_le64(bucket);
+
+ bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
}
return 0;
@@ -1749,7 +1804,7 @@ static void journal_reclaim_work(struct work_struct *work)
struct cache *ca;
struct journal_entry_pin *pin;
u64 seq_to_flush = 0;
- unsigned iter, nr, bucket_to_flush;
+ unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
@@ -1781,13 +1836,11 @@ static void journal_reclaim_work(struct work_struct *work)
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
- journal_bucket(ca->disk_sb.sb,
- ja->last_idx)),
+ ja->buckets[ja->last_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) %
- bch_nr_journal_buckets(ca->disk_sb.sb);
+ ja->last_idx = (ja->last_idx + 1) % ja->nr;
spin_unlock(&j->lock);
wake_up(&j->wait);
@@ -1798,8 +1851,7 @@ static void journal_reclaim_work(struct work_struct *work)
* buckets
*/
spin_lock(&j->lock);
- nr = bch_nr_journal_buckets(ca->disk_sb.sb),
- bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr;
+ bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
spin_unlock(&j->lock);
@@ -1861,7 +1913,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
*/
extent_for_each_ptr_backwards(e, ptr)
if (!(ca = PTR_CACHE(c, ptr)) ||
- ca->mi.state != CACHE_ACTIVE ||
+ ca->mi.state != BCH_MEMBER_STATE_ACTIVE ||
ca->journal.sectors_free <= sectors)
__bch_extent_drop_ptr(e, ptr);
else
@@ -1875,7 +1927,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
*/
group_for_each_cache_rcu(ca, &j->devs, iter) {
struct journal_device *ja = &ca->journal;
- unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
if (replicas >= replicas_want)
break;
@@ -1884,21 +1935,20 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
* Check that we can use this device, and aren't already using
* it:
*/
- if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) ||
+ if (bch_extent_has_device(e.c, ca->dev_idx) ||
!journal_dev_buckets_available(j, ca) ||
sectors > ca->mi.bucket_size)
continue;
ja->sectors_free = ca->mi.bucket_size - sectors;
- ja->cur_idx = (ja->cur_idx + 1) % nr_buckets;
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
extent_ptr_append(bkey_i_to_extent(&j->key),
(struct bch_extent_ptr) {
.offset = bucket_to_sector(ca,
- journal_bucket(ca->disk_sb.sb,
- ja->cur_idx)),
- .dev = ca->sb.nr_this_dev,
+ ja->buckets[ja->cur_idx]),
+ .dev = ca->dev_idx,
});
replicas++;
@@ -1928,10 +1978,7 @@ static void journal_write_compact(struct jset *jset)
* If we wanted to be really fancy here, we could sort all the keys in
* the jset and drop keys that were overwritten - probably not worth it:
*/
- for (i = jset->start;
- i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) &&
- (next = jset_keys_next(i), true);
- i = next) {
+ vstruct_for_each_safe(jset, i, next) {
unsigned u64s = le16_to_cpu(i->u64s);
/* Empty entry: */
@@ -1945,7 +1992,7 @@ static void journal_write_compact(struct jset *jset)
JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(jset_keys_next(prev),
+ memmove_u64s_down(vstruct_next(prev),
i->_data,
u64s);
le16_add_cpu(&prev->u64s, u64s);
@@ -1953,12 +2000,12 @@ static void journal_write_compact(struct jset *jset)
}
/* Couldn't merge, move i into new position (after prev): */
- prev = prev ? jset_keys_next(prev) : jset->start;
+ prev = prev ? vstruct_next(prev) : jset->start;
if (i != prev)
memmove_u64s_down(prev, i, jset_u64s(u64s));
}
- prev = prev ? jset_keys_next(prev) : jset->start;
+ prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}
@@ -2019,6 +2066,7 @@ static void journal_write(struct closure *cl)
struct cache_set *c = container_of(j, struct cache_set, journal);
struct cache *ca;
struct journal_buf *w = journal_prev_buf(j);
+ struct jset *jset = w->data;
struct bio *bio;
struct bch_extent_ptr *ptr;
unsigned i, sectors, bytes;
@@ -2036,24 +2084,27 @@ static void journal_write(struct closure *cl)
}
mutex_unlock(&c->btree_root_lock);
- journal_write_compact(w->data);
+ journal_write_compact(jset);
+
+ jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
+ jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
+ jset->magic = cpu_to_le64(jset_magic(c));
+ jset->version = cpu_to_le32(BCACHE_JSET_VERSION);
- w->data->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
- w->data->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
- w->data->magic = cpu_to_le64(jset_magic(&c->disk_sb));
- w->data->version = cpu_to_le32(BCACHE_JSET_VERSION);
+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+ SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c));
- SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
- SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
- w->data->csum = cpu_to_le64(__csum_set(w->data,
- le32_to_cpu(w->data->u64s),
- JSET_CSUM_TYPE(w->data)));
+ bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ jset->encrypted_start,
+ vstruct_end(jset) - (void *) jset->encrypted_start);
- sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s),
- block_bytes(c)) * c->sb.block_size;
+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+ journal_nonce(jset), jset);
+
+ sectors = vstruct_sectors(jset, c->block_bits);
BUG_ON(sectors > j->prev_buf_sectors);
- bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s));
+ bytes = vstruct_bytes(w->data);
memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
if (journal_write_alloc(j, sectors)) {
@@ -2096,7 +2147,7 @@ static void journal_write(struct closure *cl)
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_WRITE,
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch_bio_map(bio, w->data);
+ bch_bio_map(bio, jset);
trace_bcache_journal_write(bio);
closure_bio_submit_punt(bio, cl, c);
@@ -2105,7 +2156,7 @@ static void journal_write(struct closure *cl)
}
for_each_cache(ca, c, i)
- if (ca->mi.state == CACHE_ACTIVE &&
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
journal_flushes_device(ca) &&
!bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
percpu_ref_get(&ca->ref);
@@ -2503,7 +2554,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
"\tnr\t\t%u\n"
"\tcur_idx\t\t%u (seq %llu)\n"
"\tlast_idx\t%u (seq %llu)\n",
- iter, bch_nr_journal_buckets(ca->disk_sb.sb),
+ iter, ja->nr,
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
@@ -2521,7 +2572,7 @@ static bool bch_journal_writing_to_device(struct cache *ca)
spin_lock(&j->lock);
ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
- ca->sb.nr_this_dev);
+ ca->dev_idx);
spin_unlock(&j->lock);
return ret;
@@ -2541,10 +2592,11 @@ static bool bch_journal_writing_to_device(struct cache *ca)
int bch_journal_move(struct cache *ca)
{
- unsigned i, nr_buckets;
u64 last_flushed_seq;
+ struct journal_device *ja = &ca->journal;
struct cache_set *c = ca->set;
struct journal *j = &c->journal;
+ unsigned i;
int ret = 0; /* Success */
if (bch_journal_writing_to_device(ca)) {
@@ -2585,10 +2637,45 @@ int bch_journal_move(struct cache *ca)
last_flushed_seq = last_seq(j);
spin_unlock(&j->lock);
- nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-
- for (i = 0; i < nr_buckets; i += 1)
- BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq);
+ for (i = 0; i < ja->nr; i += 1)
+ BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
return ret;
}
+
+void bch_journal_free_cache(struct cache *ca)
+{
+ kfree(ca->journal.buckets);
+ kfree(ca->journal.bucket_seq);
+}
+
+int bch_journal_init_cache(struct cache *ca)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets =
+ bch_sb_get_journal(ca->disk_sb.sb);
+ unsigned i, journal_entry_pages;
+
+ journal_entry_pages =
+ DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+ PAGE_SECTORS);
+
+ ja->nr = bch_nr_journal_buckets(journal_buckets);
+
+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->bucket_seq)
+ return -ENOMEM;
+
+ ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+ if (!ca->journal.bio)
+ return -ENOMEM;
+
+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->buckets)
+ return -ENOMEM;
+
+ for (i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+
+ return 0;
+}
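/*
 * Illustrative sketch (not a drop-in helper) of the write-side ordering
 * introduced above: everything after jset->encrypted_start is encrypted in
 * place and only then checksummed, so the stored csum covers the ciphertext.
 * journal_entry_validate() does the mirror image on read: verify first, then
 * decrypt.  The nonce is derived from the entry's seq, so it is unique per
 * journal entry.  Builds only against the in-tree helpers shown here.
 */
static void seal_jset_sketch(struct cache_set *c, struct jset *jset)
{
	struct nonce nonce = journal_nonce(jset);

	bch_encrypt(c, JSET_CSUM_TYPE(jset), nonce,
		    jset->encrypted_start,
		    vstruct_end(jset) - (void *) jset->encrypted_start);

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), nonce, jset);
}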
diff --git a/libbcache/journal.h b/libbcache/journal.h
index 759ed60..9274831 100644
--- a/libbcache/journal.h
+++ b/libbcache/journal.h
@@ -111,11 +111,7 @@
#include <linux/hash.h>
#include "journal_types.h"
-
-static inline struct jset_entry *jset_keys_next(struct jset_entry *j)
-{
- return (void *) __bkey_idx(j, le16_to_cpu(j->u64s));
-}
+//#include "super-io.h"
/*
* Only used for holding the journal entries we read in btree_journal_read()
@@ -182,7 +178,7 @@ static inline void bch_journal_add_entry_at(struct journal_buf *buf,
unsigned type, enum btree_id id,
unsigned level, unsigned offset)
{
- struct jset_entry *entry = bkey_idx(buf->data, offset);
+ struct jset_entry *entry = vstruct_idx(buf->data, offset);
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
@@ -336,7 +332,7 @@ static inline int bch_journal_error(struct journal *j)
static inline bool is_journal_device(struct cache *ca)
{
- return ca->mi.state == CACHE_ACTIVE && ca->mi.tier == 0;
+ return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0;
}
static inline bool journal_flushes_device(struct cache *ca)
@@ -367,21 +363,16 @@ ssize_t bch_journal_print_debug(struct journal *, char *);
int bch_cache_journal_alloc(struct cache *);
-static inline __le64 *__journal_buckets(struct cache_sb *sb)
-{
- return sb->_data + bch_journal_buckets_offset(sb);
-}
-
-static inline u64 journal_bucket(struct cache_sb *sb, unsigned nr)
+static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
{
- return le64_to_cpu(__journal_buckets(sb)[nr]);
-}
-
-static inline void set_journal_bucket(struct cache_sb *sb, unsigned nr, u64 bucket)
-{
- __journal_buckets(sb)[nr] = cpu_to_le64(bucket);
+ return j
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
+ : 0;
}
int bch_journal_move(struct cache *);
+void bch_journal_free_cache(struct cache *);
+int bch_journal_init_cache(struct cache *);
+
#endif /* _BCACHE_JOURNAL_H */
diff --git a/libbcache/journal_types.h b/libbcache/journal_types.h
index e3698b5..5c95e37 100644
--- a/libbcache/journal_types.h
+++ b/libbcache/journal_types.h
@@ -186,7 +186,7 @@ struct journal {
* ugh: need to get prio_buckets converted over to the eventual new
* transaction machinery
*/
- __le64 prio_buckets[MAX_CACHES_PER_SET];
+ __le64 prio_buckets[BCH_SB_MEMBERS_MAX];
unsigned nr_prio_buckets;
unsigned write_delay_ms;
@@ -208,7 +208,7 @@ struct journal {
/*
* Embedded in struct cache. First three fields refer to the array of journal
- * buckets, in cache_sb.
+ * buckets, in bch_sb.
*/
struct journal_device {
/*
@@ -229,6 +229,8 @@ struct journal_device {
* sufficient to read:
*/
unsigned last_idx;
+ unsigned nr;
+ u64 *buckets;
/* Bio for journal reads/writes to this device */
struct bio *bio;
diff --git a/libbcache/migrate.c b/libbcache/migrate.c
index 5a26e22..407ca17 100644
--- a/libbcache/migrate.c
+++ b/libbcache/migrate.c
@@ -25,7 +25,7 @@ static int issue_migration_move(struct cache *ca,
return -ENOSPC;
extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
- if (ptr->dev == ca->sb.nr_this_dev)
+ if (ptr->dev == ca->dev_idx)
goto found;
BUG();
@@ -62,7 +62,7 @@ int bch_move_data_off_device(struct cache *ca)
u64 seen_key_count;
int ret = 0;
- BUG_ON(ca->mi.state == CACHE_ACTIVE);
+ BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
ctxt.avoid = ca;
@@ -99,7 +99,7 @@ int bch_move_data_off_device(struct cache *ca)
!(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k) ||
!bch_extent_has_device(bkey_s_c_to_extent(k),
- ca->sb.nr_this_dev))
+ ca->dev_idx))
goto next;
ret = issue_migration_move(ca, &ctxt, k);
@@ -151,14 +151,14 @@ static int bch_move_btree_off(struct cache *ca, enum btree_id id)
struct btree *b;
int ret;
- BUG_ON(ca->mi.state == CACHE_ACTIVE);
+ BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
closure_init_stack(&cl);
for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
retry:
- if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+ if (!bch_extent_has_device(e, ca->dev_idx))
continue;
ret = bch_btree_node_rewrite(&iter, b, &cl);
@@ -188,7 +188,7 @@ retry:
for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
- BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev));
+ BUG_ON(bch_extent_has_device(e, ca->dev_idx));
}
bch_btree_iter_unlock(&iter);
}
@@ -282,7 +282,7 @@ static int bch_flag_key_bad(struct btree_iter *iter,
e = bkey_i_to_s_extent(&tmp.key);
extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == ca->sb.nr_this_dev)
+ if (ptr->dev == ca->dev_idx)
bch_extent_drop_ptr(e, ptr);
/*
@@ -323,7 +323,7 @@ int bch_flag_data_bad(struct cache *ca)
goto advance;
e = bkey_s_c_to_extent(k);
- if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+ if (!bch_extent_has_device(e, ca->dev_idx))
goto advance;
ret = bch_flag_key_bad(&iter, ca, e);
diff --git a/libbcache/move.c b/libbcache/move.c
index f3ab9e8..655a523 100644
--- a/libbcache/move.c
+++ b/libbcache/move.c
@@ -5,7 +5,7 @@
#include "buckets.h"
#include "io.h"
#include "move.h"
-#include "super.h"
+#include "super-io.h"
#include "keylist.h"
#include <linux/ioprio.h>
@@ -63,7 +63,8 @@ static int bch_migrate_index_update(struct bch_write_op *op)
bkey_start_pos(&bch_keylist_front(keys)->k));
while (1) {
- struct bkey_i *insert = bch_keylist_front(keys);
+ struct bkey_s_extent insert =
+ bkey_i_to_s_extent(bch_keylist_front(keys));
struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
struct bch_extent_ptr *ptr;
struct bkey_s_extent e;
@@ -79,17 +80,18 @@ static int bch_migrate_index_update(struct bch_write_op *op)
bkey_reassemble(&new.k, k);
bch_cut_front(iter.pos, &new.k);
- bch_cut_back(insert->k.p, &new.k.k);
+ bch_cut_back(insert.k->p, &new.k.k);
e = bkey_i_to_s_extent(&new.k);
/* hack - promotes can race: */
if (m->promote)
- extent_for_each_ptr(bkey_i_to_s_extent(insert), ptr)
+ extent_for_each_ptr(insert, ptr)
if (bch_extent_has_device(e.c, ptr->dev))
goto nomatch;
ptr = bch_migrate_matching_ptr(m, e);
if (ptr) {
+ int nr_new_dirty = bch_extent_nr_dirty_ptrs(insert.s_c);
unsigned insert_flags =
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL;
@@ -98,17 +100,22 @@ static int bch_migrate_index_update(struct bch_write_op *op)
if (m->move)
insert_flags |= BTREE_INSERT_USE_RESERVE;
- if (m->move)
+ if (m->move) {
+ nr_new_dirty -= !ptr->cached;
__bch_extent_drop_ptr(e, ptr);
+ }
+
+ BUG_ON(nr_new_dirty < 0);
memcpy_u64s(extent_entry_last(e),
- &insert->v,
- bkey_val_u64s(&insert->k));
- e.k->u64s += bkey_val_u64s(&insert->k);
+ insert.v,
+ bkey_val_u64s(insert.k));
+ e.k->u64s += bkey_val_u64s(insert.k);
bch_extent_narrow_crcs(e);
bch_extent_drop_redundant_crcs(e);
bch_extent_normalize(c, e.s);
+ bch_extent_mark_replicas_cached(c, e, nr_new_dirty);
ret = bch_btree_insert_at(c, &op->res,
NULL, op_journal_seq(op),
@@ -148,7 +155,8 @@ void bch_migrate_write_init(struct cache_set *c,
if (move_ptr)
m->move_ptr = *move_ptr;
- if (bkey_extent_is_cached(k.k))
+ if (bkey_extent_is_cached(k.k) ||
+ (move_ptr && move_ptr->cached))
flags |= BCH_WRITE_CACHED;
bch_write_op_init(&m->op, c, &m->wbio,
@@ -160,6 +168,7 @@ void bch_migrate_write_init(struct cache_set *c,
if (m->move)
m->op.alloc_reserve = RESERVE_MOVINGGC;
+ m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k));
m->op.nr_replicas = 1;
m->op.index_update_fn = bch_migrate_index_update;
}
diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c
index cb4f165..83407eb 100644
--- a/libbcache/movinggc.c
+++ b/libbcache/movinggc.c
@@ -28,7 +28,7 @@ static const struct bch_extent_ptr *moving_pred(struct cache *ca,
if (bkey_extent_is_data(k.k) &&
(ptr = bch_extent_has_device(bkey_s_c_to_extent(k),
- ca->sb.nr_this_dev)) &&
+ ca->dev_idx)) &&
PTR_BUCKET(ca, ptr)->mark.copygc)
return ptr;
diff --git a/libbcache/notify.c b/libbcache/notify.c
index e9b5568..3a50f8f 100644
--- a/libbcache/notify.c
+++ b/libbcache/notify.c
@@ -25,7 +25,7 @@ static void notify_get(struct cache_set *c)
env->envp_idx = 0;
env->buflen = 0;
- notify_var(c, "SET_UUID=%pU", c->disk_sb.user_uuid.b);
+ notify_var(c, "SET_UUID=%pU", c->sb.user_uuid.b);
}
static void notify_get_cache(struct cache *ca)
@@ -34,7 +34,7 @@ static void notify_get_cache(struct cache *ca)
char buf[BDEVNAME_SIZE];
notify_get(c);
- notify_var(c, "UUID=%pU", ca->disk_sb.sb->disk_uuid.b);
+ notify_var(c, "UUID=%pU", ca->uuid.b);
notify_var(c, "BLOCKDEV=%s", bdevname(ca->disk_sb.bdev, buf));
}
diff --git a/libbcache/opts.c b/libbcache/opts.c
index 60a2a4d..333654e 100644
--- a/libbcache/opts.c
+++ b/libbcache/opts.c
@@ -29,7 +29,6 @@ const char * const bch_str_hash_types[] = {
"crc32c",
"crc64",
"siphash",
- "sha1",
NULL
};
@@ -70,11 +69,11 @@ const char * const bch_uint_opt[] = {
};
enum bch_opts {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
Opt_##_name,
- CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+ BCH_VISIBLE_OPTS()
+#undef BCH_OPT
Opt_bad_opt,
};
@@ -144,15 +143,15 @@ static int parse_string_opt(const struct bch_option *opt, const char *s)
static struct bch_opt_result parse_one_opt(const char *opt)
{
static const struct bch_option opt_table[] = {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
[Opt_##_name] = { \
.name = #_name, \
.opts = _choices, \
.min = _min, \
.max = _max, \
},
- CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+ BCH_VISIBLE_OPTS()
+#undef BCH_OPT
}, *i;
for (i = opt_table;
@@ -186,13 +185,13 @@ int bch_parse_options(struct cache_set_opts *opts, int flags, char *options)
struct bch_opt_result res = parse_one_opt(p);
switch (res.opt) {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
case Opt_##_name: \
opts->_name = res.val; \
break;
- CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+ BCH_VISIBLE_OPTS()
+#undef BCH_OPT
case Opt_bad_opt:
return -EINVAL;
diff --git a/libbcache/opts.h b/libbcache/opts.h
index 70df232..1d30848 100644
--- a/libbcache/opts.h
+++ b/libbcache/opts.h
@@ -30,47 +30,47 @@ extern const char * const bch_bool_opt[];
extern const char * const bch_uint_opt[];
/* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT, struct cache_sb, flags, 0, 0);
-
-#define CACHE_SET_VISIBLE_OPTS() \
- CACHE_SET_OPT(verbose_recovery, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, false) \
- CACHE_SET_OPT(posix_acl, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, false) \
- CACHE_SET_OPT(journal_flush_disabled, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, true) \
- CACHE_SET_OPT(nofsck, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, true) \
- CACHE_SET_OPT(fix_errors, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, true) \
- CACHE_SET_OPT(nochanges, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, 0) \
- CACHE_SET_OPT(noreplay, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, 0) \
- CACHE_SET_OPT(norecovery, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, 0) \
- CACHE_SET_SB_OPTS()
-
-#define CACHE_SET_OPTS() \
- CACHE_SET_OPT(read_only, \
- bch_bool_opt, 0, 2, \
- NO_SB_OPT, 0) \
- CACHE_SET_VISIBLE_OPTS()
+LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
+
+#define BCH_VISIBLE_OPTS() \
+ BCH_OPT(verbose_recovery, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, false) \
+ BCH_OPT(posix_acl, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, false) \
+ BCH_OPT(journal_flush_disabled, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ BCH_OPT(nofsck, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ BCH_OPT(fix_errors, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, true) \
+ BCH_OPT(nochanges, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, 0) \
+ BCH_OPT(noreplay, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, 0) \
+ BCH_OPT(norecovery, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, 0) \
+ BCH_SB_OPTS()
+
+#define BCH_OPTS() \
+ BCH_OPT(read_only, \
+ bch_bool_opt, 0, 2, \
+ NO_SB_OPT, 0) \
+ BCH_VISIBLE_OPTS()
struct cache_set_opts {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
s8 _name;
- CACHE_SET_OPTS()
-#undef CACHE_SET_OPT
+ BCH_OPTS()
+#undef BCH_OPT
};
static inline struct cache_set_opts cache_set_opts_empty(void)
@@ -85,27 +85,27 @@ static inline struct cache_set_opts cache_set_opts_empty(void)
* Initial options from superblock - here we don't want any options undefined,
* any options the superblock doesn't specify are set to 0:
*/
-static inline struct cache_set_opts cache_superblock_opts(struct cache_sb *sb)
+static inline struct cache_set_opts cache_superblock_opts(struct bch_sb *sb)
{
return (struct cache_set_opts) {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
._name = _sb_opt##_BITS ? _sb_opt(sb) : 0,
- CACHE_SET_OPTS()
-#undef CACHE_SET_OPT
+ BCH_SB_OPTS()
+#undef BCH_OPT
};
}
static inline void cache_set_opts_apply(struct cache_set_opts *dst,
struct cache_set_opts src)
{
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
BUILD_BUG_ON(_max > S8_MAX); \
if (src._name >= 0) \
dst->_name = src._name;
- CACHE_SET_OPTS()
-#undef CACHE_SET_OPT
+ BCH_SB_OPTS()
+#undef BCH_OPT
}
int bch_parse_options(struct cache_set_opts *, int, char *);
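/*
 * Standalone toy (illustrative only) of the x-macro pattern behind the
 * BCH_OPT()/BCH_VISIBLE_OPTS() rename above: a single option list is
 * expanded three different ways by redefining the per-entry macro at each
 * use site.  Names here are made up for the example.
 */
#include <stdio.h>

#define TOY_OPTS()			\
	TOY_OPT(posix_acl)		\
	TOY_OPT(nofsck)

/* 1) an enum of option ids */
enum {
#define TOY_OPT(_name)	Opt_##_name,
	TOY_OPTS()
#undef TOY_OPT
};

/* 2) a struct with one field per option */
struct toy_opts {
#define TOY_OPT(_name)	signed char _name;
	TOY_OPTS()
#undef TOY_OPT
};

/* 3) a table of option names */
static const char *toy_opt_names[] = {
#define TOY_OPT(_name)	#_name,
	TOY_OPTS()
#undef TOY_OPT
};

int main(void)
{
	printf("option %d is \"%s\"\n", Opt_nofsck, toy_opt_names[Opt_nofsck]);
	return 0;
}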
diff --git a/libbcache/siphash.c b/libbcache/siphash.c
index 5ba80b5..3a6c9c8 100644
--- a/libbcache/siphash.c
+++ b/libbcache/siphash.c
@@ -43,19 +43,46 @@
* https://131002.net/siphash/
*/
-//#include <sys/param.h>
-//#include <sys/systm.h>
-
#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
#include <linux/string.h>
#include "siphash.h"
-static void SipHash_CRounds(SIPHASH_CTX *, int);
-static void SipHash_Rounds(SIPHASH_CTX *, int);
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+ while (rounds--) {
+ ctx->v[0] += ctx->v[1];
+ ctx->v[2] += ctx->v[3];
+ ctx->v[1] = rol64(ctx->v[1], 13);
+ ctx->v[3] = rol64(ctx->v[3], 16);
+
+ ctx->v[1] ^= ctx->v[0];
+ ctx->v[3] ^= ctx->v[2];
+ ctx->v[0] = rol64(ctx->v[0], 32);
+
+ ctx->v[2] += ctx->v[1];
+ ctx->v[0] += ctx->v[3];
+ ctx->v[1] = rol64(ctx->v[1], 17);
+ ctx->v[3] = rol64(ctx->v[3], 21);
+
+ ctx->v[1] ^= ctx->v[2];
+ ctx->v[3] ^= ctx->v[0];
+ ctx->v[2] = rol64(ctx->v[2], 32);
+ }
+}
+
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+ u64 m = get_unaligned_le64(ptr);
-void
-SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+ ctx->v[3] ^= m;
+ SipHash_Rounds(ctx, rounds);
+ ctx->v[0] ^= m;
+}
+
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
{
u64 k0, k1;
@@ -71,8 +98,8 @@ SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
ctx->bytes = 0;
}
-void
-SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+ const void *src, size_t len)
{
const u8 *ptr = src;
size_t left, used;
@@ -88,7 +115,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
if (len >= left) {
memcpy(&ctx->buf[used], ptr, left);
- SipHash_CRounds(ctx, rc);
+ SipHash_CRounds(ctx, ctx->buf, rc);
len -= left;
ptr += left;
} else {
@@ -98,8 +125,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
}
while (len >= sizeof(ctx->buf)) {
- memcpy(ctx->buf, ptr, sizeof(ctx->buf));
- SipHash_CRounds(ctx, rc);
+ SipHash_CRounds(ctx, ptr, rc);
len -= sizeof(ctx->buf);
ptr += sizeof(ctx->buf);
}
@@ -108,8 +134,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
memcpy(&ctx->buf[used], ptr, len);
}
-void
-SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
{
u64 r;
@@ -118,8 +143,7 @@ SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
*((__le64 *) dst) = cpu_to_le64(r);
}
-u64
-SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
{
u64 r;
size_t left, used;
@@ -129,7 +153,7 @@ SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
memset(&ctx->buf[used], 0, left - 1);
ctx->buf[7] = ctx->bytes;
- SipHash_CRounds(ctx, rc);
+ SipHash_CRounds(ctx, ctx->buf, rc);
ctx->v[2] ^= 0xff;
SipHash_Rounds(ctx, rf);
@@ -138,48 +162,11 @@ SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
return (r);
}
-u64
-SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
{
SIPHASH_CTX ctx;
SipHash_Init(&ctx, key);
SipHash_Update(&ctx, rc, rf, src, len);
- return (SipHash_End(&ctx, rc, rf));
-}
-
-#define SIP_ROTL(x, b) ((x) << (b)) | ( (x) >> (64 - (b)))
-
-static void
-SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
-{
- while (rounds--) {
- ctx->v[0] += ctx->v[1];
- ctx->v[2] += ctx->v[3];
- ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
- ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
-
- ctx->v[1] ^= ctx->v[0];
- ctx->v[3] ^= ctx->v[2];
- ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
-
- ctx->v[2] += ctx->v[1];
- ctx->v[0] += ctx->v[3];
- ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
- ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
-
- ctx->v[1] ^= ctx->v[2];
- ctx->v[3] ^= ctx->v[0];
- ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
- }
-}
-
-static void
-SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
-{
- u64 m = le64_to_cpu(*((__le64 *)ctx->buf));
-
- ctx->v[3] ^= m;
- SipHash_Rounds(ctx, rounds);
- ctx->v[0] ^= m;
+ return SipHash_End(&ctx, rc, rf);
}
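/*
 * Usage sketch (illustrative only; builds against the in-tree "siphash.h"):
 * SipHash-2-4 of a buffer via the one-shot helper kept above.  The key value
 * is a made-up example, not anything used by the filesystem.
 */
#include "siphash.h"

static u64 toy_siphash24(const void *buf, size_t len)
{
	static const SIPHASH_KEY key = {
		.k0 = cpu_to_le64(0x0706050403020100ULL),
		.k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL),
	};

	return SipHash(&key, 2, 4, buf, len);
}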
diff --git a/libbcache/str_hash.h b/libbcache/str_hash.h
index a489304..b14d05c 100644
--- a/libbcache/str_hash.h
+++ b/libbcache/str_hash.h
@@ -3,37 +3,74 @@
#include "btree_iter.h"
#include "checksum.h"
+#include "inode.h"
#include "siphash.h"
#include "super.h"
-#include <crypto/sha1_base.h>
#include <linux/crc32c.h>
+#include <crypto/hash.h>
-static const SIPHASH_KEY bch_siphash_key = {
- .k0 = cpu_to_le64(0x5a9585fd80087730ULL),
- .k1 = cpu_to_le64(0xc8de666d50b45664ULL ),
+struct bch_hash_info {
+ u8 type;
+ union {
+ __le64 crc_key;
+ SIPHASH_KEY siphash_key;
+ };
};
+static inline struct bch_hash_info
+bch_hash_info_init(const struct bch_inode_unpacked *bi)
+{
+ /* XXX ick */
+ struct bch_hash_info info = {
+ .type = (bi->i_flags >> INODE_STR_HASH_OFFSET) &
+ ~(~0 << INODE_STR_HASH_BITS)
+ };
+
+ switch (info.type) {
+ case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_CRC64:
+ info.crc_key = bi->i_hash_seed;
+ break;
+ case BCH_STR_HASH_SIPHASH: {
+ SHASH_DESC_ON_STACK(desc, bch_sha256);
+ u8 digest[crypto_shash_digestsize(bch_sha256)];
+
+ desc->tfm = bch_sha256;
+ desc->flags = 0;
+
+ crypto_shash_digest(desc, (void *) &bi->i_hash_seed,
+ sizeof(bi->i_hash_seed), digest);
+ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ return info;
+}
+
struct bch_str_hash_ctx {
union {
- u32 crc32c;
- u64 crc64;
- SIPHASH_CTX siphash;
+ u32 crc32c;
+ u64 crc64;
+ SIPHASH_CTX siphash;
};
};
static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx,
- enum bch_str_hash_type type)
+ const struct bch_hash_info *info)
{
- switch (type) {
+ switch (info->type) {
case BCH_STR_HASH_CRC32C:
- ctx->crc32c = ~0;
+ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
break;
case BCH_STR_HASH_CRC64:
- ctx->crc64 = ~0;
+ ctx->crc64 = bch_crc64_update(~0, &info->crc_key, sizeof(info->crc_key));
break;
case BCH_STR_HASH_SIPHASH:
- SipHash24_Init(&ctx->siphash, &bch_siphash_key);
+ SipHash24_Init(&ctx->siphash, &info->siphash_key);
break;
default:
BUG();
@@ -41,10 +78,10 @@ static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx,
}
static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx,
- enum bch_str_hash_type type,
- const void *data, size_t len)
+ const struct bch_hash_info *info,
+ const void *data, size_t len)
{
- switch (type) {
+ switch (info->type) {
case BCH_STR_HASH_CRC32C:
ctx->crc32c = crc32c(ctx->crc32c, data, len);
break;
@@ -60,9 +97,9 @@ static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx,
}
static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx,
- enum bch_str_hash_type type)
+ const struct bch_hash_info *info)
{
- switch (type) {
+ switch (info->type) {
case BCH_STR_HASH_CRC32C:
return ctx->crc32c;
case BCH_STR_HASH_CRC64:
@@ -74,19 +111,6 @@ static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx,
}
}
-struct bch_hash_info {
- u64 seed;
- u8 type;
-};
-
-static inline struct bch_hash_info bch_hash_info_init(const struct bch_inode *bi)
-{
- return (struct bch_hash_info) {
- .seed = le64_to_cpu(bi->i_hash_seed),
- .type = INODE_STR_HASH_TYPE(bi),
- };
-}
-
struct bch_hash_desc {
enum btree_id btree_id;
u8 key_type;
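/*
 * Illustrative sketch (not a drop-in helper) of the reworked string-hash API
 * above: the hash type and the per-inode seed now travel together in
 * struct bch_hash_info, so callers pass the info rather than a bare type.
 * Builds only against the in-tree headers.
 */
static u64 hash_name_sketch(const struct bch_inode_unpacked *dir,
			    const void *name, size_t len)
{
	struct bch_hash_info info = bch_hash_info_init(dir);
	struct bch_str_hash_ctx ctx;

	bch_str_hash_init(&ctx, &info);
	bch_str_hash_update(&ctx, &info, name, len);
	return bch_str_hash_end(&ctx, &info);
}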
diff --git a/libbcache/super-io.c b/libbcache/super-io.c
new file mode 100644
index 0000000..66338a1
--- /dev/null
+++ b/libbcache/super-io.c
@@ -0,0 +1,798 @@
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "checksum.h"
+#include "error.h"
+#include "io.h"
+#include "journal.h"
+#include "super-io.h"
+#include "super.h"
+#include "vstructs.h"
+
+#include <linux/backing-dev.h>
+
+static inline void __bch_sb_layout_size_assert(void)
+{
+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+}
+
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb,
+ enum bch_sb_field_types type)
+{
+ struct bch_sb_field *f;
+
+ /* XXX: need locking around superblock to access optional fields */
+
+ vstruct_for_each(sb, f)
+ if (le32_to_cpu(f->type) == type)
+ return f;
+ return NULL;
+}
+
+void bch_free_super(struct bcache_superblock *sb)
+{
+ if (sb->bio)
+ bio_put(sb->bio);
+ if (!IS_ERR_OR_NULL(sb->bdev))
+ blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+ free_pages((unsigned long) sb->sb, sb->page_order);
+ memset(sb, 0, sizeof(*sb));
+}
+
+static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
+{
+ struct bch_sb *new_sb;
+ struct bio *bio;
+
+ if (sb->page_order >= order && sb->sb)
+ return 0;
+
+ if (dynamic_fault("bcache:add:super_realloc"))
+ return -ENOMEM;
+
+ bio = bio_kmalloc(GFP_KERNEL, 1 << order);
+ if (!bio)
+ return -ENOMEM;
+
+ if (sb->bio)
+ bio_put(sb->bio);
+ sb->bio = bio;
+
+ new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
+ if (!new_sb)
+ return -ENOMEM;
+
+ if (sb->sb)
+ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
+
+ free_pages((unsigned long) sb->sb, sb->page_order);
+ sb->sb = new_sb;
+
+ sb->page_order = order;
+
+ return 0;
+}
+
+int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
+{
+ u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+
+ if (new_bytes > max_bytes) {
+ char buf[BDEVNAME_SIZE];
+
+ pr_err("%s: superblock too big: want %llu but have %llu",
+ bdevname(sb->bdev, buf), new_bytes, max_bytes);
+ return -ENOSPC;
+ }
+
+ return __bch_super_realloc(sb, get_order(new_bytes));
+}
+
+static int bch_fs_sb_realloc(struct cache_set *c, unsigned u64s)
+{
+ u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
+ struct bch_sb *sb;
+ unsigned order = get_order(bytes);
+
+ if (c->disk_sb && order <= c->disk_sb_order)
+ return 0;
+
+ sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+ if (!sb)
+ return -ENOMEM;
+
+ if (c->disk_sb)
+ memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);
+
+ free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
+
+ c->disk_sb = sb;
+ c->disk_sb_order = order;
+ return 0;
+}
+
+static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ unsigned u64s)
+{
+ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+
+ if (!f) {
+ f = vstruct_last(sb);
+ memset(f, 0, sizeof(u64) * u64s);
+ f->u64s = cpu_to_le32(u64s);
+ f->type = 0;
+ } else {
+ void *src, *dst;
+
+ src = vstruct_end(f);
+ f->u64s = cpu_to_le32(u64s);
+ dst = vstruct_end(f);
+
+ memmove(dst, src, vstruct_end(sb) - src);
+
+ if (dst > src)
+ memset(src, 0, dst - src);
+ }
+
+ le32_add_cpu(&sb->u64s, u64s - old_u64s);
+
+ return f;
+}
+
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
+ struct bch_sb_field *f,
+ unsigned u64s)
+{
+ ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ ssize_t d = -old_u64s + u64s;
+ struct cache *ca;
+ unsigned i;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ if (bch_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
+ return NULL;
+
+ for_each_cache(ca, c, i) {
+ struct bcache_superblock *sb = &ca->disk_sb;
+
+ if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+ percpu_ref_put(&ca->ref);
+ return NULL;
+ }
+ }
+
+ return __bch_sb_field_resize(c->disk_sb, f, u64s);
+}
+
+struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb,
+ struct bch_sb_field *f,
+ unsigned u64s)
+{
+ ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ ssize_t d = -old_u64s + u64s;
+
+ if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+ return NULL;
+
+ return __bch_sb_field_resize(sb->sb, f, u64s);
+}
+
+static const char *validate_sb_layout(struct bch_sb_layout *layout)
+{
+ u64 offset, prev_offset, max_sectors;
+ unsigned i;
+
+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
+ return "Not a bcache superblock layout";
+
+ if (layout->layout_type != 0)
+ return "Invalid superblock layout type";
+
+ if (!layout->nr_superblocks)
+ return "Invalid superblock layout: no superblocks";
+
+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
+ return "Invalid superblock layout: too many superblocks";
+
+ max_sectors = 1 << layout->sb_max_size_bits;
+
+ prev_offset = le64_to_cpu(layout->sb_offset[0]);
+
+ if (prev_offset != BCH_SB_SECTOR)
+ return "Invalid superblock layout: doesn't have default superblock location";
+
+ for (i = 1; i < layout->nr_superblocks; i++) {
+ offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset < prev_offset + max_sectors)
+ return "Invalid superblock layout: superblocks overlap";
+ prev_offset = offset;
+ }
+
+ return NULL;
+}
+
+const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
+{
+ struct bch_sb *sb = disk_sb->sb;
+ struct bch_sb_field *f;
+ struct bch_sb_field_members *sb_mi;
+ struct bch_sb_field_journal *journal;
+ struct cache_member_cpu mi;
+ const char *err;
+ u16 block_size;
+ unsigned i;
+
+ switch (le64_to_cpu(sb->version)) {
+ case BCACHE_SB_VERSION_CDEV_V4:
+ break;
+ default:
+ return "Unsupported superblock version";
+ }
+
+ if (BCH_SB_INITIALIZED(sb) &&
+ le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4)
+ return "Unsupported superblock version";
+
+ block_size = le16_to_cpu(sb->block_size);
+
+ if (!is_power_of_2(block_size) ||
+ block_size > PAGE_SECTORS)
+ return "Bad block size";
+
+ if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
+ return "Bad user UUID";
+
+ if (bch_is_zero(sb->uuid.b, sizeof(uuid_le)))
+ return "Bad internal UUID";
+
+ if (!sb->nr_devices ||
+ sb->nr_devices <= sb->dev_idx ||
+ sb->nr_devices > BCH_SB_MEMBERS_MAX)
+ return "Bad cache device number in set";
+
+ if (!BCH_SB_META_REPLICAS_WANT(sb) ||
+ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ return "Invalid number of metadata replicas";
+
+ if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
+ BCH_SB_META_REPLICAS_HAVE(sb) >
+ BCH_SB_META_REPLICAS_WANT(sb))
+ return "Invalid number of metadata replicas";
+
+ if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
+ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ return "Invalid number of data replicas";
+
+ if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
+ BCH_SB_DATA_REPLICAS_HAVE(sb) >
+ BCH_SB_DATA_REPLICAS_WANT(sb))
+ return "Invalid number of data replicas";
+
+ if (!BCH_SB_BTREE_NODE_SIZE(sb))
+ return "Btree node size not set";
+
+ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
+ return "Btree node size not a power of two";
+
+ if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
+ return "Btree node size too large";
+
+ if (BCH_SB_GC_RESERVE(sb) < 5)
+ return "gc reserve percentage too small";
+
+ if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size)
+ return "max journal entry size too small";
+
+ /* 4 mb max: */
+ if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
+ return "max journal entry size too big";
+
+ if (!sb->time_precision ||
+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
+ return "invalid time precision";
+
+ /* validate layout */
+ err = validate_sb_layout(&sb->layout);
+ if (err)
+ return err;
+
+ vstruct_for_each(sb, f) {
+ if (!f->u64s)
+ return "Invalid superblock: invalid optional field";
+
+ if (vstruct_next(f) > vstruct_last(sb))
+ return "Invalid superblock: invalid optional field";
+
+ if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR)
+ return "Invalid superblock: unknown optional field type";
+ }
+
+ /* Validate member info: */
+ sb_mi = bch_sb_get_members(sb);
+ if (!sb_mi)
+ return "Invalid superblock: member info area missing";
+
+ if ((void *) (sb_mi->members + sb->nr_devices) >
+ vstruct_end(&sb_mi->field))
+ return "Invalid superblock: bad member info";
+
+ mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
+
+ for (i = 0; i < sb->layout.nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(sb->layout.sb_offset[i]);
+ u64 max_size = 1 << sb->layout.sb_max_size_bits;
+
+ if (offset + max_size > mi.first_bucket * mi.bucket_size)
+ return "Invalid superblock: first bucket comes before end of super";
+ }
+
+ if (mi.nbuckets > LONG_MAX)
+ return "Too many buckets";
+
+ if (mi.nbuckets - mi.first_bucket < 1 << 10)
+ return "Not enough buckets";
+
+ if (!is_power_of_2(mi.bucket_size) ||
+ mi.bucket_size < PAGE_SECTORS ||
+ mi.bucket_size < block_size)
+ return "Bad bucket size";
+
+ if (get_capacity(disk_sb->bdev->bd_disk) <
+ mi.bucket_size * mi.nbuckets)
+ return "Invalid superblock: device too small";
+
+ /* Validate journal buckets: */
+ journal = bch_sb_get_journal(sb);
+ if (journal) {
+ for (i = 0; i < bch_nr_journal_buckets(journal); i++) {
+ u64 b = le64_to_cpu(journal->buckets[i]);
+
+ if (b < mi.first_bucket || b >= mi.nbuckets)
+ return "bad journal bucket";
+ }
+ }
+
+ return NULL;
+}
+
+/* device open: */
+
+static bool bch_is_open_cache(struct block_device *bdev)
+{
+ struct cache_set *c;
+ struct cache *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ list_for_each_entry(c, &bch_cache_sets, list)
+ for_each_cache_rcu(ca, c, i)
+ if (ca->disk_sb.bdev == bdev) {
+ rcu_read_unlock();
+ return true;
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+static bool bch_is_open(struct block_device *bdev)
+{
+ lockdep_assert_held(&bch_register_lock);
+
+ return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+}
+
+static const char *bch_blkdev_open(const char *path, void *holder,
+ struct cache_set_opts opts,
+ struct block_device **ret)
+{
+ struct block_device *bdev;
+ fmode_t mode = opts.nochanges > 0
+ ? FMODE_READ
+ : FMODE_READ|FMODE_WRITE|FMODE_EXCL;
+ const char *err;
+
+ *ret = NULL;
+ bdev = blkdev_get_by_path(path, mode, holder);
+
+ if (bdev == ERR_PTR(-EBUSY)) {
+ bdev = lookup_bdev(path);
+ if (IS_ERR(bdev))
+ return "device busy";
+
+ err = bch_is_open(bdev)
+ ? "device already registered"
+ : "device busy";
+
+ bdput(bdev);
+ return err;
+ }
+
+ if (IS_ERR(bdev))
+ return "failed to open device";
+
+ bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+
+ *ret = bdev;
+ return NULL;
+}
+
+/* Update cached mi: */
+int bch_cache_set_mi_update(struct cache_set *c,
+ struct bch_member *mi,
+ unsigned nr_devices)
+{
+ struct cache_member_rcu *new, *old;
+ struct cache *ca;
+ unsigned i;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ new = kzalloc(sizeof(struct cache_member_rcu) +
+ sizeof(struct cache_member_cpu) * nr_devices,
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr_devices = nr_devices;
+
+ for (i = 0; i < nr_devices; i++)
+ new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
+
+ rcu_read_lock();
+ for_each_cache(ca, c, i)
+ ca->mi = new->m[i];
+ rcu_read_unlock();
+
+ old = rcu_dereference_protected(c->members,
+ lockdep_is_held(&c->sb_lock));
+
+ rcu_assign_pointer(c->members, new);
+ if (old)
+ kfree_rcu(old, rcu);
+
+ return 0;
+}
+
+static void bch_sb_update(struct cache_set *c)
+{
+ struct bch_sb *src = c->disk_sb;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ c->sb.uuid = src->uuid;
+ c->sb.user_uuid = src->user_uuid;
+ c->sb.block_size = le16_to_cpu(src->block_size);
+ c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src);
+ c->sb.nr_devices = src->nr_devices;
+ c->sb.clean = BCH_SB_CLEAN(src);
+ c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src);
+ c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src);
+ c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src);
+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
+ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
+ c->sb.time_precision = le32_to_cpu(src->time_precision);
+}
+
+/* doesn't copy member info */
+static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
+{
+ struct bch_sb_field *src_f, *dst_f;
+
+ dst->version = src->version;
+ dst->seq = src->seq;
+ dst->uuid = src->uuid;
+ dst->user_uuid = src->user_uuid;
+ memcpy(dst->label, src->label, sizeof(dst->label));
+
+ dst->block_size = src->block_size;
+ dst->nr_devices = src->nr_devices;
+
+ dst->time_base_lo = src->time_base_lo;
+ dst->time_base_hi = src->time_base_hi;
+ dst->time_precision = src->time_precision;
+
+ memcpy(dst->flags, src->flags, sizeof(dst->flags));
+ memcpy(dst->features, src->features, sizeof(dst->features));
+ memcpy(dst->compat, src->compat, sizeof(dst->compat));
+
+ vstruct_for_each(src, src_f) {
+ if (src_f->type == BCH_SB_FIELD_journal)
+ continue;
+
+ dst_f = bch_sb_field_get(dst, src_f->type);
+ dst_f = __bch_sb_field_resize(dst, dst_f,
+ le32_to_cpu(src_f->u64s));
+
+ memcpy(dst_f, src_f, vstruct_bytes(src_f));
+ }
+}
+
+int bch_sb_to_cache_set(struct cache_set *c, struct bch_sb *src)
+{
+ struct bch_sb_field_members *members =
+ bch_sb_get_members(src);
+ struct bch_sb_field_journal *journal_buckets =
+ bch_sb_get_journal(src);
+ unsigned journal_u64s = journal_buckets
+ ? le32_to_cpu(journal_buckets->field.u64s)
+ : 0;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ if (bch_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s))
+ return -ENOMEM;
+
+ if (bch_cache_set_mi_update(c, members->members, src->nr_devices))
+ return -ENOMEM;
+
+ __copy_super(c->disk_sb, src);
+ bch_sb_update(c);
+
+ return 0;
+}
+
+int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca)
+{
+ struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
+ struct bch_sb_field_journal *journal_buckets =
+ bch_sb_get_journal(dst);
+ unsigned journal_u64s = journal_buckets
+ ? le32_to_cpu(journal_buckets->field.u64s)
+ : 0;
+ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
+ int ret;
+
+ ret = bch_dev_sb_realloc(&ca->disk_sb, u64s);
+ if (ret)
+ return ret;
+
+ __copy_super(dst, src);
+
+ return 0;
+}
+
+/* read superblock: */
+
+static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
+{
+ struct bch_csum csum;
+ size_t bytes;
+ unsigned order;
+reread:
+ bio_reset(sb->bio);
+ sb->bio->bi_bdev = sb->bdev;
+ sb->bio->bi_iter.bi_sector = offset;
+ sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+ bch_bio_map(sb->bio, sb->sb);
+
+ if (submit_bio_wait(sb->bio))
+ return "IO error";
+
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
+ return "Not a bcache superblock";
+
+ if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4)
+ return "Unsupported superblock version";
+
+ bytes = vstruct_bytes(sb->sb);
+
+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
+ return "Bad superblock: too big";
+
+ order = get_order(bytes);
+ if (order > sb->page_order) {
+ if (__bch_super_realloc(sb, order))
+ return "cannot allocate memory";
+ goto reread;
+ }
+
+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
+ return "unknown csum type";
+
+ /* XXX: verify MACs */
+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
+ (struct nonce) { 0 }, sb->sb);
+
+ if (bch_crc_cmp(csum, sb->sb->csum))
+ return "bad checksum reading superblock";
+
+ return NULL;
+}
+
+const char *bch_read_super(struct bcache_superblock *sb,
+ struct cache_set_opts opts,
+ const char *path)
+{
+ struct bch_sb_layout layout;
+ const char *err;
+ unsigned i;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ memset(sb, 0, sizeof(*sb));
+
+ err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
+ if (err)
+ return err;
+
+ err = "cannot allocate memory";
+ if (__bch_super_realloc(sb, 0))
+ goto err;
+
+ err = "dynamic fault";
+ if (cache_set_init_fault("read_super"))
+ goto err;
+
+ err = read_one_super(sb, BCH_SB_SECTOR);
+ if (!err)
+ goto got_super;
+
+ pr_err("error reading default super: %s", err);
+
+ /*
+ * Error reading primary superblock - read location of backup
+ * superblocks:
+ */
+ bio_reset(sb->bio);
+ sb->bio->bi_bdev = sb->bdev;
+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+ sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+ /*
+ * use sb buffer to read layout, since sb buffer is page aligned but
+ * layout won't be:
+ */
+ bch_bio_map(sb->bio, sb->sb);
+
+ err = "IO error";
+ if (submit_bio_wait(sb->bio))
+ goto err;
+
+ memcpy(&layout, sb->sb, sizeof(layout));
+ err = validate_sb_layout(&layout);
+ if (err)
+ goto err;
+
+ for (i = 0; i < layout.nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout.sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR)
+ continue;
+
+ err = read_one_super(sb, offset);
+ if (!err)
+ goto got_super;
+ }
+ goto err;
+got_super:
+ pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ le64_to_cpu(sb->sb->version),
+ le64_to_cpu(sb->sb->flags),
+ le64_to_cpu(sb->sb->seq),
+ le16_to_cpu(sb->sb->u64s));
+
+ err = "Superblock block size smaller than device block size";
+ if (le16_to_cpu(sb->sb->block_size) << 9 <
+ bdev_logical_block_size(sb->bdev))
+ goto err;
+
+ return NULL;
+err:
+ bch_free_super(sb);
+ return err;
+}
+
+/* write superblock: */
+
+static void write_super_endio(struct bio *bio)
+{
+ struct cache *ca = bio->bi_private;
+
+ /* XXX: return errors directly */
+
+ cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
+
+ bch_account_io_completion(ca);
+
+ closure_put(&ca->set->sb_write);
+ percpu_ref_put(&ca->ref);
+}
+
+static bool write_one_super(struct cache_set *c, struct cache *ca, unsigned idx)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ if (idx >= sb->layout.nr_superblocks)
+ return false;
+
+ sb->offset = sb->layout.sb_offset[idx];
+
+ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
+ (struct nonce) { 0 }, sb);
+
+ bio_reset(bio);
+ bio->bi_bdev = ca->disk_sb.bdev;
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
+ bio->bi_iter.bi_size =
+ roundup(vstruct_bytes(sb),
+ bdev_logical_block_size(ca->disk_sb.bdev));
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
+ bch_bio_map(bio, sb);
+
+ percpu_ref_get(&ca->ref);
+ closure_bio_submit_punt(bio, &c->sb_write, c);
+
+ return true;
+}
+
+void bch_write_super(struct cache_set *c)
+{
+ struct bch_sb_field_members *members =
+ bch_sb_get_members(c->disk_sb);
+ struct closure *cl = &c->sb_write;
+ struct cache *ca;
+ unsigned i, super_idx = 0;
+ bool wrote;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ closure_init_stack(cl);
+
+ le64_add_cpu(&c->disk_sb->seq, 1);
+
+ for_each_cache(ca, c, i)
+ bch_sb_from_cache_set(c, ca);
+
+ do {
+ wrote = false;
+ for_each_cache(ca, c, i)
+ if (write_one_super(c, ca, super_idx))
+ wrote = true;
+
+ closure_sync(cl);
+ super_idx++;
+ } while (wrote);
+
+ /* Make new options visible after they're persistent: */
+ bch_cache_set_mi_update(c, members->members, c->sb.nr_devices);
+ bch_sb_update(c);
+}
+
+void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
+ bool meta)
+{
+ struct bch_member *mi;
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+
+ mutex_lock(&c->sb_lock);
+
+ /* recheck, might have raced */
+ if (bch_check_super_marked(c, k, meta)) {
+ mutex_unlock(&c->sb_lock);
+ return;
+ }
+
+ mi = bch_sb_get_members(c->disk_sb)->members;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached)
+ (meta
+ ? SET_BCH_MEMBER_HAS_METADATA
+ : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
+
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
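
The new super-io.c centralizes superblock field management: optional fields are looked up with bch_sb_field_get(), grown with the *_sb_field_resize() helpers (which also grow each member device's copy), and persisted with bch_write_super() under c->sb_lock. Below is a rough sketch of that calling pattern, assuming the surrounding bcache headers; the wrapper function and its error handling are assumptions, and u64s here means the total field size in u64s including the field header:

/*
 * Illustrative sketch (not from the diff): resizing the journal field and
 * persisting it. bch_fs_sb_field_resize() and bch_write_super() both expect
 * c->sb_lock to be held; a newly created field comes back with type 0, so
 * the caller is presumably responsible for setting it.
 */
static int example_resize_journal_field(struct cache_set *c, unsigned u64s)
{
	struct bch_sb_field_journal *journal;
	struct bch_sb_field *f;
	int ret = 0;

	mutex_lock(&c->sb_lock);

	journal = bch_sb_get_journal(c->disk_sb);
	f = bch_fs_sb_field_resize(c, journal ? &journal->field : NULL, u64s);
	if (!f) {
		ret = -ENOMEM;
		goto out;
	}

	f->type = cpu_to_le32(BCH_SB_FIELD_journal);
	bch_write_super(c);
out:
	mutex_unlock(&c->sb_lock);
	return ret;
}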
diff --git a/libbcache/super-io.h b/libbcache/super-io.h
new file mode 100644
index 0000000..1eda57b
--- /dev/null
+++ b/libbcache/super-io.h
@@ -0,0 +1,141 @@
+#ifndef _BCACHE_SUPER_IO_H
+#define _BCACHE_SUPER_IO_H
+
+#include "extents.h"
+#include "super_types.h"
+
+#include <asm/byteorder.h>
+
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types);
+
+#define BCH_SB_FIELD_TYPE(_name) \
+static inline struct bch_sb_field_##_name * \
+bch_sb_get_##_name(struct bch_sb *sb) \
+{ \
+ struct bch_sb_field *f = \
+ bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \
+ \
+ return container_of_or_null(f, struct bch_sb_field_##_name, field);\
+}
+
+BCH_SB_FIELD_TYPE(journal);
+BCH_SB_FIELD_TYPE(members);
+BCH_SB_FIELD_TYPE(crypt);
+
+static inline bool bch_sb_test_feature(struct bch_sb *sb,
+ enum bch_sb_features f)
+{
+ unsigned w = f / 64;
+ unsigned b = f % 64;
+
+ return le64_to_cpu(sb->features[w]) & (1ULL << b);
+}
+
+static inline void bch_sb_set_feature(struct bch_sb *sb,
+ enum bch_sb_features f)
+{
+ if (!bch_sb_test_feature(sb, f)) {
+ unsigned w = f / 64;
+ unsigned b = f % 64;
+
+ le64_add_cpu(&sb->features[w], 1ULL << b);
+ }
+}
+
+static inline __le64 bch_sb_magic(struct cache_set *c)
+{
+ __le64 ret;
+ memcpy(&ret, &c->sb.uuid, sizeof(ret));
+ return ret;
+}
+
+static inline __u64 jset_magic(struct cache_set *c)
+{
+ return __le64_to_cpu(bch_sb_magic(c) ^ JSET_MAGIC);
+}
+
+static inline __u64 pset_magic(struct cache_set *c)
+{
+ return __le64_to_cpu(bch_sb_magic(c) ^ PSET_MAGIC);
+}
+
+static inline __u64 bset_magic(struct cache_set *c)
+{
+ return __le64_to_cpu(bch_sb_magic(c) ^ BSET_MAGIC);
+}
+
+static inline struct cache_member_cpu cache_mi_to_cpu_mi(struct bch_member *mi)
+{
+ return (struct cache_member_cpu) {
+ .nbuckets = le64_to_cpu(mi->nbuckets),
+ .first_bucket = le16_to_cpu(mi->first_bucket),
+ .bucket_size = le16_to_cpu(mi->bucket_size),
+ .state = BCH_MEMBER_STATE(mi),
+ .tier = BCH_MEMBER_TIER(mi),
+ .has_metadata = BCH_MEMBER_HAS_METADATA(mi),
+ .has_data = BCH_MEMBER_HAS_DATA(mi),
+ .replacement = BCH_MEMBER_REPLACEMENT(mi),
+ .discard = BCH_MEMBER_DISCARD(mi),
+ .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
+ };
+}
+
+int bch_cache_set_mi_update(struct cache_set *, struct bch_member *, unsigned);
+
+int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *);
+int bch_sb_from_cache_set(struct cache_set *, struct cache *);
+
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
+ struct bch_sb_field *, unsigned);
+struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *,
+ struct bch_sb_field *, unsigned);
+
+void bch_free_super(struct bcache_superblock *);
+int bch_super_realloc(struct bcache_superblock *, unsigned);
+
+const char *bch_validate_cache_super(struct bcache_superblock *);
+
+const char *bch_read_super(struct bcache_superblock *,
+ struct cache_set_opts, const char *);
+void bch_write_super(struct cache_set *);
+
+void bch_check_mark_super_slowpath(struct cache_set *,
+ const struct bkey_i *, bool);
+
+#define cache_member_info_get(_c) \
+ (rcu_read_lock(), rcu_dereference((_c)->members))
+
+#define cache_member_info_put() rcu_read_unlock()
+
+static inline bool bch_check_super_marked(struct cache_set *c,
+ const struct bkey_i *k, bool meta)
+{
+ struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ const struct bch_extent_ptr *ptr;
+ struct cache_member_cpu *mi = cache_member_info_get(c)->m;
+ bool ret = true;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached &&
+ !(meta
+ ? mi[ptr->dev].has_metadata
+ : mi[ptr->dev].has_data)) {
+ ret = false;
+ break;
+ }
+
+ cache_member_info_put();
+
+ return ret;
+}
+
+static inline void bch_check_mark_super(struct cache_set *c,
+ const struct bkey_i *k, bool meta)
+{
+ if (bch_check_super_marked(c, k, meta))
+ return;
+
+ bch_check_mark_super_slowpath(c, k, meta);
+}
+
+#endif /* _BCACHE_SUPER_IO_H */
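
For readers unfamiliar with the accessor macro in super-io.h above, BCH_SB_FIELD_TYPE(members) expands to roughly the following; the expansion is shown only for illustration:

static inline struct bch_sb_field_members *
bch_sb_get_members(struct bch_sb *sb)
{
	/* macro expansion of BCH_SB_FIELD_TYPE(members), shown for illustration */
	struct bch_sb_field *f =
		bch_sb_field_get(sb, BCH_SB_FIELD_members);

	return container_of_or_null(f, struct bch_sb_field_members, field);
}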
diff --git a/libbcache/super.c b/libbcache/super.c
index 296700b..c026c0d 100644
--- a/libbcache/super.c
+++ b/libbcache/super.c
@@ -31,12 +31,14 @@
#include "notify.h"
#include "stats.h"
#include "super.h"
+#include "super-io.h"
#include "tier.h"
#include "writeback.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
+#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
@@ -69,70 +71,11 @@ static struct device *bch_chardev;
static DEFINE_IDR(bch_chardev_minor);
static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
struct workqueue_struct *bcache_io_wq;
-struct crypto_shash *bch_sha1;
+struct crypto_shash *bch_sha256;
static void bch_cache_stop(struct cache *);
static int bch_cache_online(struct cache *);
-static bool bch_is_open_cache(struct block_device *bdev)
-{
- struct cache_set *c;
- struct cache *ca;
- unsigned i;
-
- rcu_read_lock();
- list_for_each_entry(c, &bch_cache_sets, list)
- for_each_cache_rcu(ca, c, i)
- if (ca->disk_sb.bdev == bdev) {
- rcu_read_unlock();
- return true;
- }
- rcu_read_unlock();
- return false;
-}
-
-static bool bch_is_open(struct block_device *bdev)
-{
- lockdep_assert_held(&bch_register_lock);
-
- return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
-}
-
-static const char *bch_blkdev_open(const char *path, void *holder,
- struct cache_set_opts opts,
- struct block_device **ret)
-{
- struct block_device *bdev;
- fmode_t mode = opts.nochanges > 0
- ? FMODE_READ
- : FMODE_READ|FMODE_WRITE|FMODE_EXCL;
- const char *err;
-
- *ret = NULL;
- bdev = blkdev_get_by_path(path, mode, holder);
-
- if (bdev == ERR_PTR(-EBUSY)) {
- bdev = lookup_bdev(path);
- if (IS_ERR(bdev))
- return "device busy";
-
- err = bch_is_open(bdev)
- ? "device already registered"
- : "device busy";
-
- bdput(bdev);
- return err;
- }
-
- if (IS_ERR(bdev))
- return "failed to open device";
-
- bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
-
- *ret = bdev;
- return NULL;
-}
-
static int bch_congested_fn(void *data, int bdi_bits)
{
struct backing_dev_info *bdi;
@@ -168,520 +111,6 @@ static int bch_congested_fn(void *data, int bdi_bits)
return ret;
}
-/* Superblock */
-
-static struct cache_member_cpu cache_mi_to_cpu_mi(struct cache_member *mi)
-{
- return (struct cache_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .state = CACHE_STATE(mi),
- .tier = CACHE_TIER(mi),
- .replication_set= CACHE_REPLICATION_SET(mi),
- .has_metadata = CACHE_HAS_METADATA(mi),
- .has_data = CACHE_HAS_DATA(mi),
- .replacement = CACHE_REPLACEMENT(mi),
- .discard = CACHE_DISCARD(mi),
- .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
- };
-}
-
-static const char *validate_cache_super(struct bcache_superblock *disk_sb)
-{
- struct cache_sb *sb = disk_sb->sb;
- struct cache_member_cpu mi;
- u16 block_size;
- unsigned i;
-
- switch (le64_to_cpu(sb->version)) {
- case BCACHE_SB_VERSION_CDEV_V0:
- case BCACHE_SB_VERSION_CDEV_WITH_UUID:
- case BCACHE_SB_VERSION_CDEV_V2:
- case BCACHE_SB_VERSION_CDEV_V3:
- break;
- default:
- return"Unsupported superblock version";
- }
-
- if (CACHE_SET_SYNC(sb) &&
- le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V3)
- return "Unsupported superblock version";
-
- block_size = le16_to_cpu(sb->block_size);
-
- if (!is_power_of_2(block_size) ||
- block_size > PAGE_SECTORS)
- return "Bad block size";
-
- if (bch_is_zero(sb->disk_uuid.b, sizeof(uuid_le)))
- return "Bad disk UUID";
-
- if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
- return "Bad user UUID";
-
- if (bch_is_zero(sb->set_uuid.b, sizeof(uuid_le)))
- return "Bad set UUID";
-
- if (!sb->nr_in_set ||
- sb->nr_in_set <= sb->nr_this_dev ||
- sb->nr_in_set > MAX_CACHES_PER_SET)
- return "Bad cache device number in set";
-
- if (!CACHE_SET_META_REPLICAS_WANT(sb) ||
- CACHE_SET_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
-
- if (!CACHE_SET_META_REPLICAS_HAVE(sb) ||
- CACHE_SET_META_REPLICAS_HAVE(sb) >
- CACHE_SET_META_REPLICAS_WANT(sb))
- return "Invalid number of metadata replicas";
-
- if (!CACHE_SET_DATA_REPLICAS_WANT(sb) ||
- CACHE_SET_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
-
- if (!CACHE_SET_DATA_REPLICAS_HAVE(sb) ||
- CACHE_SET_DATA_REPLICAS_HAVE(sb) >
- CACHE_SET_DATA_REPLICAS_WANT(sb))
- return "Invalid number of data replicas";
-
- if (CACHE_SB_CSUM_TYPE(sb) >= BCH_CSUM_NR)
- return "Invalid checksum type";
-
- if (!CACHE_SET_BTREE_NODE_SIZE(sb))
- return "Btree node size not set";
-
- if (!is_power_of_2(CACHE_SET_BTREE_NODE_SIZE(sb)))
- return "Btree node size not a power of two";
-
- if (CACHE_SET_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
- return "Btree node size too large";
-
- /* Default value, for old filesystems: */
- if (!CACHE_SET_GC_RESERVE(sb))
- SET_CACHE_SET_GC_RESERVE(sb, 10);
-
- if (CACHE_SET_GC_RESERVE(sb) < 5)
- return "gc reserve percentage too small";
-
- if (!CACHE_SET_JOURNAL_ENTRY_SIZE(sb))
- SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, 9);
-
- /* 4 mb max: */
- if (512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
- return "max journal entry size too big";
-
- if (le16_to_cpu(sb->u64s) < bch_journal_buckets_offset(sb))
- return "Invalid superblock: member info area missing";
-
- mi = cache_mi_to_cpu_mi(sb->members + sb->nr_this_dev);
-
- if (mi.nbuckets > LONG_MAX)
- return "Too many buckets";
-
- if (mi.nbuckets < 1 << 8)
- return "Not enough buckets";
-
- if (!is_power_of_2(mi.bucket_size) ||
- mi.bucket_size < PAGE_SECTORS ||
- mi.bucket_size < block_size)
- return "Bad bucket size";
-
- if (get_capacity(disk_sb->bdev->bd_disk) <
- mi.bucket_size * mi.nbuckets)
- return "Invalid superblock: device too small";
-
- if (le64_to_cpu(sb->offset) +
- (__set_blocks(sb, le16_to_cpu(sb->u64s),
- block_size << 9) * block_size) >
- mi.first_bucket * mi.bucket_size)
- return "Invalid superblock: first bucket comes before end of super";
-
- for (i = 0; i < bch_nr_journal_buckets(sb); i++)
- if (journal_bucket(sb, i) < mi.first_bucket ||
- journal_bucket(sb, i) >= mi.nbuckets)
- return "bad journal bucket";
-
- return NULL;
-}
-
-void free_super(struct bcache_superblock *sb)
-{
- if (sb->bio)
- bio_put(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-
- free_pages((unsigned long) sb->sb, sb->page_order);
- memset(sb, 0, sizeof(*sb));
-}
-
-static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
-{
- struct cache_sb *new_sb;
- struct bio *bio;
-
- if (sb->page_order >= order && sb->sb)
- return 0;
-
- new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
- if (!new_sb)
- return -ENOMEM;
-
- bio = (dynamic_fault("bcache:add:super_realloc")
- ? NULL
- : bio_kmalloc(GFP_KERNEL, 1 << order));
- if (!bio) {
- free_pages((unsigned long) new_sb, order);
- return -ENOMEM;
- }
-
- if (sb->sb)
- memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
-
- free_pages((unsigned long) sb->sb, sb->page_order);
- sb->sb = new_sb;
-
- if (sb->bio)
- bio_put(sb->bio);
- sb->bio = bio;
-
- sb->page_order = order;
-
- return 0;
-}
-
-int bch_super_realloc(struct bcache_superblock *sb, unsigned u64s)
-{
- struct cache_member *mi = sb->sb->members + sb->sb->nr_this_dev;
- char buf[BDEVNAME_SIZE];
- size_t bytes = __set_bytes((struct cache_sb *) NULL, u64s);
- u64 want = bytes + (SB_SECTOR << 9);
-
- u64 first_bucket_offset = (u64) le16_to_cpu(mi->first_bucket) *
- ((u64) le16_to_cpu(mi->bucket_size) << 9);
-
- if (want > first_bucket_offset) {
- pr_err("%s: superblock too big: want %llu but have %llu",
- bdevname(sb->bdev, buf), want, first_bucket_offset);
- return -ENOSPC;
- }
-
- return __bch_super_realloc(sb, get_order(bytes));
-}
-
-static const char *read_super(struct bcache_superblock *sb,
- struct cache_set_opts opts,
- const char *path)
-{
- const char *err;
- unsigned order = 0;
-
- lockdep_assert_held(&bch_register_lock);
-
- memset(sb, 0, sizeof(*sb));
-
- err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
- if (err)
- return err;
-retry:
- err = "cannot allocate memory";
- if (__bch_super_realloc(sb, order))
- goto err;
-
- err = "dynamic fault";
- if (cache_set_init_fault("read_super"))
- goto err;
-
- bio_reset(sb->bio);
- sb->bio->bi_bdev = sb->bdev;
- sb->bio->bi_iter.bi_sector = SB_SECTOR;
- sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
- bch_bio_map(sb->bio, sb->sb);
-
- err = "IO error";
- if (submit_bio_wait(sb->bio))
- goto err;
-
- err = "Not a bcache superblock";
- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
- goto err;
-
- err = "Superblock has incorrect offset";
- if (le64_to_cpu(sb->sb->offset) != SB_SECTOR)
- goto err;
-
- pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
- le64_to_cpu(sb->sb->version),
- le64_to_cpu(sb->sb->flags),
- le64_to_cpu(sb->sb->seq),
- le16_to_cpu(sb->sb->u64s));
-
- err = "Superblock block size smaller than device block size";
- if (le16_to_cpu(sb->sb->block_size) << 9 <
- bdev_logical_block_size(sb->bdev))
- goto err;
-
- order = get_order(__set_bytes(sb->sb, le16_to_cpu(sb->sb->u64s)));
- if (order > sb->page_order)
- goto retry;
-
- err = "bad checksum reading superblock";
- if (le64_to_cpu(sb->sb->csum) !=
- __csum_set(sb->sb, le16_to_cpu(sb->sb->u64s),
- le64_to_cpu(sb->sb->version) <
- BCACHE_SB_VERSION_CDEV_V3
- ? BCH_CSUM_CRC64
- : CACHE_SB_CSUM_TYPE(sb->sb)))
- goto err;
-
- return NULL;
-err:
- free_super(sb);
- return err;
-}
-
-void __write_super(struct cache_set *c, struct bcache_superblock *disk_sb)
-{
- struct cache_sb *sb = disk_sb->sb;
- struct bio *bio = disk_sb->bio;
-
- bio->bi_bdev = disk_sb->bdev;
- bio->bi_iter.bi_sector = SB_SECTOR;
- bio->bi_iter.bi_size =
- roundup(__set_bytes(sb, le16_to_cpu(sb->u64s)),
- bdev_logical_block_size(disk_sb->bdev));
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
- bch_bio_map(bio, sb);
-
- pr_debug("ver %llu, flags %llu, seq %llu",
- le64_to_cpu(sb->version),
- le64_to_cpu(sb->flags),
- le64_to_cpu(sb->seq));
-
- bch_generic_make_request(bio, c);
-}
-
-static void write_super_endio(struct bio *bio)
-{
- struct cache *ca = bio->bi_private;
-
- /* XXX: return errors directly */
-
- cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
-
- bch_account_io_completion(ca);
-
- closure_put(&ca->set->sb_write);
- percpu_ref_put(&ca->ref);
-}
-
-static void bcache_write_super_unlock(struct closure *cl)
-{
- struct cache_set *c = container_of(cl, struct cache_set, sb_write);
-
- up(&c->sb_write_mutex);
-}
-
-/* Update cached mi: */
-static int cache_set_mi_update(struct cache_set *c,
- struct cache_member *mi,
- unsigned nr_in_set)
-{
- struct cache_member_rcu *new, *old;
- struct cache *ca;
- unsigned i;
-
- mutex_lock(&c->mi_lock);
-
- new = kzalloc(sizeof(struct cache_member_rcu) +
- sizeof(struct cache_member_cpu) * nr_in_set,
- GFP_KERNEL);
- if (!new) {
- mutex_unlock(&c->mi_lock);
- return -ENOMEM;
- }
-
- new->nr_in_set = nr_in_set;
-
- for (i = 0; i < nr_in_set; i++)
- new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
-
- rcu_read_lock();
- for_each_cache(ca, c, i)
- ca->mi = new->m[i];
- rcu_read_unlock();
-
- old = rcu_dereference_protected(c->members,
- lockdep_is_held(&c->mi_lock));
-
- rcu_assign_pointer(c->members, new);
- if (old)
- kfree_rcu(old, rcu);
-
- mutex_unlock(&c->mi_lock);
- return 0;
-}
-
-/* doesn't copy member info */
-static void __copy_super(struct cache_sb *dst, struct cache_sb *src)
-{
- dst->version = src->version;
- dst->seq = src->seq;
- dst->user_uuid = src->user_uuid;
- dst->set_uuid = src->set_uuid;
- memcpy(dst->label, src->label, SB_LABEL_SIZE);
- dst->flags = src->flags;
- dst->flags2 = src->flags2;
- dst->nr_in_set = src->nr_in_set;
- dst->block_size = src->block_size;
-}
-
-static int cache_sb_to_cache_set(struct cache_set *c, struct cache_sb *src)
-{
- struct cache_member *new;
-
- lockdep_assert_held(&bch_register_lock);
-
- new = kzalloc(sizeof(struct cache_member) * src->nr_in_set,
- GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- memcpy(new, src->members,
- src->nr_in_set * sizeof(struct cache_member));
-
- if (cache_set_mi_update(c, new, src->nr_in_set)) {
- kfree(new);
- return -ENOMEM;
- }
-
- kfree(c->disk_mi);
- c->disk_mi = new;
-
- __copy_super(&c->disk_sb, src);
-
- c->sb.block_size = le16_to_cpu(src->block_size);
- c->sb.btree_node_size = CACHE_SET_BTREE_NODE_SIZE(src);
- c->sb.nr_in_set = src->nr_in_set;
- c->sb.clean = CACHE_SET_CLEAN(src);
- c->sb.meta_replicas_have= CACHE_SET_META_REPLICAS_HAVE(src);
- c->sb.data_replicas_have= CACHE_SET_DATA_REPLICAS_HAVE(src);
- c->sb.str_hash_type = CACHE_SET_STR_HASH_TYPE(src);
-
- return 0;
-}
-
-static int cache_sb_from_cache_set(struct cache_set *c, struct cache *ca)
-{
- struct cache_sb *src = &c->disk_sb, *dst = ca->disk_sb.sb;
-
- if (src->nr_in_set != dst->nr_in_set) {
- /*
- * We have to preserve the list of journal buckets on the
- * cache's superblock:
- */
- unsigned old_offset = bch_journal_buckets_offset(dst);
- unsigned u64s = bch_journal_buckets_offset(src)
- + bch_nr_journal_buckets(dst);
- int ret = bch_super_realloc(&ca->disk_sb, u64s);
-
- if (ret)
- return ret;
-
- dst->nr_in_set = src->nr_in_set;
- dst->u64s = cpu_to_le16(u64s);
-
- memmove(dst->_data + bch_journal_buckets_offset(dst),
- dst->_data + old_offset,
- bch_nr_journal_buckets(dst) * sizeof(u64));
- }
-
- memcpy(dst->_data,
- c->disk_mi,
- src->nr_in_set * sizeof(struct cache_member));
-
- __copy_super(dst, src);
-
- return 0;
-}
-
-static void __bcache_write_super(struct cache_set *c)
-{
- struct closure *cl = &c->sb_write;
- struct cache *ca;
- unsigned i;
-
- cache_set_mi_update(c, c->disk_mi, c->sb.nr_in_set);
-
- closure_init(cl, &c->cl);
-
- if (c->opts.nochanges)
- goto no_io;
-
- le64_add_cpu(&c->disk_sb.seq, 1);
-
- for_each_cache(ca, c, i) {
- struct cache_sb *sb = ca->disk_sb.sb;
- struct bio *bio = ca->disk_sb.bio;
-
- cache_sb_from_cache_set(c, ca);
-
- SET_CACHE_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
- sb->csum = cpu_to_le64(__csum_set(sb,
- le16_to_cpu(sb->u64s),
- CACHE_SB_CSUM_TYPE(sb)));
-
- bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_end_io = write_super_endio;
- bio->bi_private = ca;
-
- closure_get(cl);
- percpu_ref_get(&ca->ref);
- __write_super(c, &ca->disk_sb);
- }
-no_io:
- closure_return_with_destructor(cl, bcache_write_super_unlock);
-}
-
-void bcache_write_super(struct cache_set *c)
-{
- down(&c->sb_write_mutex);
- __bcache_write_super(c);
-}
-
-void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
- bool meta)
-{
- struct cache_member *mi;
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
-
- if (!CACHE_SET_SYNC(&c->disk_sb))
- return;
-
- down(&c->sb_write_mutex);
-
- /* recheck, might have raced */
- if (bch_check_super_marked(c, k, meta)) {
- up(&c->sb_write_mutex);
- return;
- }
-
- mi = c->disk_mi;
-
- extent_for_each_ptr(e, ptr)
- if (bch_extent_ptr_is_dirty(c, e, ptr))
- (meta
- ? SET_CACHE_HAS_METADATA
- : SET_CACHE_HAS_DATA)(mi + ptr->dev, true);
-
- __bcache_write_super(c);
-}
-
/* Cache set RO/RW: */
/*
@@ -768,8 +197,10 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
if (!bch_journal_error(&c->journal) &&
!test_bit(CACHE_SET_ERROR, &c->flags)) {
- SET_CACHE_SET_CLEAN(&c->disk_sb, true);
- bcache_write_super(c);
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb, true);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
}
} else {
/*
@@ -848,7 +279,7 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
err = "error starting allocator thread";
for_each_cache(ca, c, i)
- if (ca->mi.state == CACHE_ACTIVE &&
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
bch_cache_allocator_start(ca)) {
percpu_ref_put(&ca->ref);
goto err;
@@ -859,7 +290,7 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
goto err;
for_each_cache(ca, c, i) {
- if (ca->mi.state != CACHE_ACTIVE)
+ if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
continue;
err = "error starting moving GC thread";
@@ -913,6 +344,7 @@ static void cache_set_free(struct cache_set *c)
cancel_work_sync(&c->bio_submit_work);
cancel_work_sync(&c->read_retry_work);
+ bch_cache_set_encryption_free(c);
bch_btree_cache_free(c);
bch_journal_free(&c->journal);
bch_io_clock_exit(&c->io_clock[WRITE]);
@@ -939,7 +371,7 @@ static void cache_set_free(struct cache_set *c)
destroy_workqueue(c->wq);
kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
- kfree(c->disk_mi);
+ free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
kfree(c);
module_put(THIS_MODULE);
}
@@ -1043,15 +475,18 @@ void bch_cache_set_unregister(struct cache_set *c)
static unsigned cache_set_nr_devices(struct cache_set *c)
{
+ struct bch_sb_field_members *mi;
unsigned i, nr = 0;
- struct cache_member *mi = c->disk_mi;
- lockdep_assert_held(&bch_register_lock);
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
- for (i = 0; i < c->disk_sb.nr_in_set; i++)
- if (!bch_is_zero(mi[i].uuid.b, sizeof(uuid_le)))
+ for (i = 0; i < c->disk_sb->nr_devices; i++)
+ if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
nr++;
+ mutex_unlock(&c->sb_lock);
+
return nr;
}
@@ -1059,7 +494,7 @@ static unsigned cache_set_nr_online_devices(struct cache_set *c)
{
unsigned i, nr = 0;
- for (i = 0; i < c->sb.nr_in_set; i++)
+ for (i = 0; i < c->sb.nr_devices; i++)
if (c->cache[i])
nr++;
@@ -1069,7 +504,7 @@ static unsigned cache_set_nr_online_devices(struct cache_set *c)
#define alloc_bucket_pages(gfp, ca) \
((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
-static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
+static struct cache_set *bch_cache_set_alloc(struct bch_sb *sb,
struct cache_set_opts opts)
{
struct cache_set *c;
@@ -1083,13 +518,12 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
c->minor = -1;
- sema_init(&c->sb_write_mutex, 1);
+ mutex_init(&c->sb_lock);
INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
mutex_init(&c->btree_cache_lock);
mutex_init(&c->bucket_lock);
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
- mutex_init(&c->mi_lock);
init_rwsem(&c->gc_lock);
@@ -1146,10 +580,16 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
mutex_init(&c->uevent_lock);
- if (cache_sb_to_cache_set(c, sb))
+ mutex_lock(&c->sb_lock);
+
+ if (bch_sb_to_cache_set(c, sb)) {
+ mutex_unlock(&c->sb_lock);
goto err;
+ }
+
+ mutex_unlock(&c->sb_lock);
- scnprintf(c->name, sizeof(c->name), "%pU", &c->disk_sb.user_uuid);
+ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
c->opts = cache_superblock_opts(sb);
cache_set_opts_apply(&c->opts, opts);
@@ -1165,7 +605,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
iter_size = (btree_blocks(c) + 1) * 2 *
sizeof(struct btree_node_iter_set);
- journal_entry_bytes = 512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb);
+ journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
if (!(c->wq = alloc_workqueue("bcache",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
@@ -1185,7 +625,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->sb.btree_node_size,
- CRC32_EXTENT_SIZE_MAX) /
+ BCH_ENCODED_EXTENT_MAX) /
PAGE_SECTORS, 0) ||
!(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
lg_lock_init(&c->bucket_stats_lock) ||
@@ -1196,7 +636,9 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
bch_io_clock_init(&c->io_clock[WRITE]) ||
bch_journal_alloc(&c->journal, journal_entry_bytes) ||
bch_btree_cache_alloc(c) ||
- bch_compress_init(c))
+ bch_cache_set_encryption_init(c) ||
+ bch_compress_init(c) ||
+ bch_check_set_has_compressed_data(c, c->opts.compression))
goto err;
c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
@@ -1247,7 +689,7 @@ static int bch_cache_set_online(struct cache_set *c)
if (IS_ERR(c->chardev))
return PTR_ERR(c->chardev);
- if (kobject_add(&c->kobj, NULL, "%pU", c->disk_sb.user_uuid.b) ||
+ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
kobject_add(&c->internal, &c->kobj, "internal") ||
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
@@ -1267,6 +709,7 @@ static int bch_cache_set_online(struct cache_set *c)
static const char *run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
+ struct bch_sb_field_members *mi;
struct cache *ca;
unsigned i, id;
time64_t now;
@@ -1285,15 +728,9 @@ static const char *run_cache_set(struct cache_set *c)
* we start testing it.
*/
for_each_cache(ca, c, i)
- cache_sb_from_cache_set(c, ca);
+ bch_sb_from_cache_set(c, ca);
- /*
- * CACHE_SET_SYNC is true if the cache set has already been run
- * and potentially has data.
- * It is false if it is the first time it is run.
- */
-
- if (CACHE_SET_SYNC(&c->disk_sb)) {
+ if (BCH_SB_INITIALIZED(c->disk_sb)) {
ret = bch_journal_read(c, &journal);
if (ret)
goto err;
@@ -1363,7 +800,7 @@ static const char *run_cache_set(struct cache_set *c)
err = "error starting allocator thread";
for_each_cache(ca, c, i)
- if (ca->mi.state == CACHE_ACTIVE &&
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
bch_cache_allocator_start(ca)) {
percpu_ref_put(&ca->ref);
goto err;
@@ -1381,25 +818,16 @@ static const char *run_cache_set(struct cache_set *c)
if (c->opts.norecovery)
goto recovery_done;
- /*
- * Write a new journal entry _before_ we start journalling new
- * data - otherwise, we could end up with btree node bsets with
- * journal seqs arbitrarily far in the future vs. the most
- * recently written journal entry on disk, if we crash before
- * writing the next journal entry:
- */
- err = "error writing journal entry";
- if (bch_journal_meta(&c->journal))
- goto err;
-
bch_verbose(c, "starting fsck:");
err = "error in fsck";
ret = bch_fsck(c, !c->opts.nofsck);
if (ret)
goto err;
+
bch_verbose(c, "fsck done");
} else {
- struct bkey_i_inode inode;
+ struct bch_inode_unpacked inode;
+ struct bkey_inode_buf packed_inode;
struct closure cl;
closure_init_stack(&cl);
@@ -1424,7 +852,7 @@ static const char *run_cache_set(struct cache_set *c)
err = "error starting allocator thread";
for_each_cache(ca, c, i)
- if (ca->mi.state == CACHE_ACTIVE &&
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
bch_cache_allocator_start(ca)) {
percpu_ref_put(&ca->ref);
goto err;
@@ -1442,10 +870,13 @@ static const char *run_cache_set(struct cache_set *c)
bch_inode_init(c, &inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
- inode.k.p.inode = BCACHE_ROOT_INO;
+ inode.inum = BCACHE_ROOT_INO;
+
+ bch_inode_pack(&packed_inode, &inode);
err = "error creating root directory";
- if (bch_btree_insert(c, BTREE_ID_INODES, &inode.k_i,
+ if (bch_btree_insert(c, BTREE_ID_INODES,
+ &packed_inode.inode.k_i,
NULL, NULL, NULL, 0))
goto err;
@@ -1462,16 +893,21 @@ recovery_done:
goto err;
}
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
now = ktime_get_seconds();
+
rcu_read_lock();
for_each_cache_rcu(ca, c, i)
- c->disk_mi[ca->sb.nr_this_dev].last_mount = cpu_to_le64(now);
+ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
rcu_read_unlock();
- /* Mark cache set as initialized: */
- SET_CACHE_SET_SYNC(&c->disk_sb, true);
- SET_CACHE_SET_CLEAN(&c->disk_sb, false);
- bcache_write_super(c);
+ SET_BCH_SB_INITIALIZED(c->disk_sb, true);
+ SET_BCH_SB_CLEAN(c->disk_sb, false);
+ c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
+
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
err = "dynamic fault";
if (cache_set_init_fault("run_cache_set"))
@@ -1527,41 +963,46 @@ err:
goto out;
}
-static const char *can_add_cache(struct cache_sb *sb,
+static const char *can_add_cache(struct bch_sb *sb,
struct cache_set *c)
{
+ struct bch_sb_field_members *sb_mi;
+
+ sb_mi = bch_sb_get_members(sb);
+ if (!sb_mi)
+ return "Invalid superblock: member info area missing";
+
if (le16_to_cpu(sb->block_size) != c->sb.block_size)
return "mismatched block size";
- if (le16_to_cpu(sb->members[sb->nr_this_dev].bucket_size) <
- CACHE_SET_BTREE_NODE_SIZE(&c->disk_sb))
+ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
return "new cache bucket_size is too small";
return NULL;
}
-static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
+static const char *can_attach_cache(struct bch_sb *sb, struct cache_set *c)
{
+ struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
+ struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb);
+ uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid;
const char *err;
- bool match;
err = can_add_cache(sb, c);
if (err)
return err;
+ if (bch_is_zero(&dev_uuid, sizeof(dev_uuid)))
+ return "device has been removed";
+
/*
* When attaching an existing device, the cache set superblock must
* already contain member_info with a matching UUID
*/
- match = le64_to_cpu(sb->seq) <= le64_to_cpu(c->disk_sb.seq)
- ? (sb->nr_this_dev < c->disk_sb.nr_in_set &&
- !memcmp(&c->disk_mi[sb->nr_this_dev].uuid,
- &sb->disk_uuid, sizeof(uuid_le)))
- : (sb->nr_this_dev < sb->nr_in_set &&
- !memcmp(&sb->members[sb->nr_this_dev].uuid,
- &sb->disk_uuid, sizeof(uuid_le)));
-
- if (!match)
+ if (sb->dev_idx >= c->disk_sb->nr_devices ||
+ memcmp(&mi->members[sb->dev_idx].uuid,
+ &dev_uuid, sizeof(uuid_le)))
return "cache sb does not match set";
return NULL;
@@ -1572,13 +1013,14 @@ static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
bool bch_cache_read_only(struct cache *ca)
{
struct cache_set *c = ca->set;
+ struct bch_sb_field_members *mi;
char buf[BDEVNAME_SIZE];
bdevname(ca->disk_sb.bdev, buf);
lockdep_assert_held(&bch_register_lock);
- if (ca->mi.state != CACHE_ACTIVE)
+ if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
return false;
if (!bch_cache_may_remove(ca)) {
@@ -1609,8 +1051,12 @@ bool bch_cache_read_only(struct cache *ca)
bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
bch_notify_cache_read_only(ca);
- SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
- bcache_write_super(c);
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
+ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
+ BCH_MEMBER_STATE_RO);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
return true;
}
@@ -1618,7 +1064,7 @@ static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
{
lockdep_assert_held(&bch_register_lock);
- if (ca->mi.state == CACHE_ACTIVE)
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
return NULL;
if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
@@ -1645,14 +1091,19 @@ static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
const char *bch_cache_read_write(struct cache *ca)
{
struct cache_set *c = ca->set;
+ struct bch_sb_field_members *mi;
const char *err;
err = __bch_cache_read_write(c, ca);
if (err)
return err;
- SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_ACTIVE);
- bcache_write_super(c);
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
+ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
+ BCH_MEMBER_STATE_ACTIVE);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
return NULL;
}
@@ -1681,14 +1132,14 @@ static void bch_cache_free_work(struct work_struct *work)
if (c && c->kobj.state_in_sysfs) {
char buf[12];
- sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+ sprintf(buf, "cache%u", ca->dev_idx);
sysfs_remove_link(&c->kobj, buf);
}
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
- free_super(&ca->disk_sb);
+ bch_free_super(&ca->disk_sb);
/*
* bch_cache_stop can be called in the middle of initialization
@@ -1697,10 +1148,10 @@ static void bch_cache_free_work(struct work_struct *work)
* However, they were zeroed when the object was allocated.
*/
+ bch_journal_free_cache(ca);
free_percpu(ca->sectors_written);
bioset_exit(&ca->replica_set);
free_percpu(ca->bucket_stats_percpu);
- kfree(ca->journal.bucket_seq);
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
kfree(ca->prio_buckets);
kfree(ca->bio_prio);
@@ -1754,8 +1205,8 @@ static void bch_cache_stop(struct cache *ca)
lockdep_assert_held(&bch_register_lock);
if (c) {
- BUG_ON(rcu_access_pointer(c->cache[ca->sb.nr_this_dev]) != ca);
- rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], NULL);
+ BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
+ rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
}
call_rcu(&ca->free_rcu, bch_cache_free_rcu);
@@ -1764,10 +1215,11 @@ static void bch_cache_stop(struct cache *ca)
static void bch_cache_remove_work(struct work_struct *work)
{
struct cache *ca = container_of(work, struct cache, remove_work);
+ struct bch_sb_field_members *mi;
struct cache_set *c = ca->set;
char name[BDEVNAME_SIZE];
bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
- unsigned dev = ca->sb.nr_this_dev;
+ unsigned dev_idx = ca->dev_idx;
bdevname(ca->disk_sb.bdev, name);
@@ -1780,17 +1232,21 @@ static void bch_cache_remove_work(struct work_struct *work)
if (!ca->mi.has_data) {
/* Nothing to do: */
} else if (!bch_move_data_off_device(ca)) {
- lockdep_assert_held(&bch_register_lock);
- SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
+ SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
- bcache_write_super(c);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
} else if (force) {
bch_flag_data_bad(ca);
- lockdep_assert_held(&bch_register_lock);
- SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
+ SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
- bcache_write_super(c);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
} else {
bch_err(c, "Remove of %s failed, unable to migrate data off",
name);
@@ -1803,10 +1259,12 @@ static void bch_cache_remove_work(struct work_struct *work)
if (!ca->mi.has_metadata) {
/* Nothing to do: */
} else if (!bch_move_meta_data_off_device(ca)) {
- lockdep_assert_held(&bch_register_lock);
- SET_CACHE_HAS_METADATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
+ SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
- bcache_write_super(c);
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
} else {
bch_err(c, "Remove of %s failed, unable to migrate metadata off",
name);
@@ -1821,7 +1279,7 @@ static void bch_cache_remove_work(struct work_struct *work)
bch_notify_cache_removed(ca);
spin_lock(&c->journal.lock);
- c->journal.prio_buckets[dev] = 0;
+ c->journal.prio_buckets[dev_idx] = 0;
spin_unlock(&c->journal.lock);
bch_journal_meta(&c->journal);
@@ -1844,12 +1302,16 @@ static void bch_cache_remove_work(struct work_struct *work)
lockdep_assert_held(&bch_register_lock);
/*
- * Free this device's slot in the cache_member array - all pointers to
+ * Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
*/
- memset(&c->disk_mi[dev].uuid, 0, sizeof(c->disk_mi[dev].uuid));
+ mutex_lock(&c->sb_lock);
+ mi = bch_sb_get_members(c->disk_sb);
+ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+
+ bch_write_super(c);
+ mutex_unlock(&c->sb_lock);
- bcache_write_super(c);
mutex_unlock(&bch_register_lock);
closure_put(&c->cl);
@@ -1891,7 +1353,7 @@ static int bch_cache_online(struct cache *ca)
lockdep_assert_held(&bch_register_lock);
- sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+ sprintf(buf, "cache%u", ca->dev_idx);
if (kobject_add(&ca->kobj,
&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
@@ -1907,13 +1369,14 @@ static const char *cache_alloc(struct bcache_superblock *sb,
struct cache_set *c,
struct cache **ret)
{
+ struct bch_member *member;
size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
size_t heap_size;
- unsigned i, journal_entry_pages;
+ unsigned i;
const char *err = "cannot allocate memory";
struct cache *ca;
- if (c->sb.nr_in_set == 1)
+ if (c->sb.nr_devices == 1)
bdevname(sb->bdev, c->name);
if (cache_set_init_fault("cache_alloc"))
@@ -1934,7 +1397,7 @@ static const char *cache_alloc(struct bcache_superblock *sb,
spin_lock_init(&ca->self.lock);
ca->self.nr_devices = 1;
rcu_assign_pointer(ca->self.d[0].dev, ca);
- ca->sb.nr_this_dev = sb->sb->nr_this_dev;
+ ca->dev_idx = sb->sb->dev_idx;
INIT_WORK(&ca->free_work, bch_cache_free_work);
INIT_WORK(&ca->remove_work, bch_cache_remove_work);
@@ -1953,8 +1416,11 @@ static const char *cache_alloc(struct bcache_superblock *sb,
if (cache_set_init_fault("cache_alloc"))
goto err;
- ca->mi = cache_mi_to_cpu_mi(ca->disk_sb.sb->members +
- ca->disk_sb.sb->nr_this_dev);
+ member = bch_sb_get_members(ca->disk_sb.sb)->members +
+ ca->disk_sb.sb->dev_idx;
+
+ ca->mi = cache_mi_to_cpu_mi(member);
+ ca->uuid = member->uuid;
ca->bucket_bits = ilog2(ca->mi.bucket_size);
/* XXX: tune these */
@@ -1968,10 +1434,6 @@ static const char *cache_alloc(struct bcache_superblock *sb,
free_inc_reserve = movinggc_reserve / 2;
heap_size = movinggc_reserve * 8;
- journal_entry_pages =
- DIV_ROUND_UP(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
- PAGE_SECTORS);
-
if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_MOVINGGC],
@@ -1987,13 +1449,11 @@ static const char *cache_alloc(struct bcache_superblock *sb,
2, GFP_KERNEL)) ||
!(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
!(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
- !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb),
- sizeof(u64), GFP_KERNEL)) ||
- !(ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages)) ||
- !(ca->bio_prio = bio_kmalloc(GFP_KERNEL, bucket_pages(ca))) ||
+ !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
bioset_init(&ca->replica_set, 4,
offsetof(struct bch_write_bio, bio)) ||
- !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
+ !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
+ bch_journal_init_cache(ca))
goto err;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
@@ -2006,15 +1466,6 @@ static const char *cache_alloc(struct bcache_superblock *sb,
ca->copygc_write_point.group = &ca->self;
ca->tiering_write_point.group = &ca->self;
- kobject_get(&c->kobj);
- ca->set = c;
-
- kobject_get(&ca->kobj);
- rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], ca);
-
- if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb.seq))
- cache_sb_to_cache_set(c, ca->disk_sb.sb);
-
/*
* Increase journal write timeout if flushes to this device are
* expensive:
@@ -2024,6 +1475,19 @@ static const char *cache_alloc(struct bcache_superblock *sb,
c->journal.write_delay_ms =
max(c->journal.write_delay_ms, 1000U);
+ kobject_get(&c->kobj);
+ ca->set = c;
+
+ kobject_get(&ca->kobj);
+ rcu_assign_pointer(c->cache[ca->dev_idx], ca);
+
+ mutex_lock(&c->sb_lock);
+
+ if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq))
+ bch_sb_to_cache_set(c, ca->disk_sb.sb);
+
+ mutex_unlock(&c->sb_lock);
+
err = "error creating kobject";
if (c->kobj.state_in_sysfs &&
bch_cache_online(ca))
@@ -2046,7 +1510,7 @@ static struct cache_set *cache_set_lookup(uuid_le uuid)
lockdep_assert_held(&bch_register_lock);
list_for_each_entry(c, &bch_cache_sets, list)
- if (!memcmp(&c->disk_sb.set_uuid, &uuid, sizeof(uuid_le)))
+ if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
return c;
return NULL;
@@ -2060,13 +1524,13 @@ static const char *register_cache(struct bcache_superblock *sb,
struct cache_set *c;
bool allocated_cache_set = false;
- err = validate_cache_super(sb);
+ err = bch_validate_cache_super(sb);
if (err)
return err;
bdevname(sb->bdev, name);
- c = cache_set_lookup(sb->sb->set_uuid);
+ c = cache_set_lookup(sb->sb->uuid);
if (c) {
err = can_attach_cache(sb->sb, c);
if (err)
@@ -2106,20 +1570,23 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path)
struct bcache_superblock sb;
const char *err;
struct cache *ca;
- struct cache_member *new_mi = NULL;
- struct cache_member mi;
- unsigned nr_this_dev, nr_in_set, u64s;
+ struct bch_sb_field *f;
+ struct bch_sb_field_members *mi, *dev_mi;
+ struct bch_member saved_mi;
+ unsigned dev_idx, nr_devices, u64s;
int ret = -EINVAL;
mutex_lock(&bch_register_lock);
- err = read_super(&sb, c->opts, path);
+ err = bch_read_super(&sb, c->opts, path);
if (err)
- goto err_unlock;
+ goto err_unlock_register;
- err = validate_cache_super(&sb);
+ err = bch_validate_cache_super(&sb);
if (err)
- goto err_unlock;
+ goto err_unlock_register;
+
+ mutex_lock(&c->sb_lock);
err = can_add_cache(sb.sb, c);
if (err)
@@ -2129,8 +1596,9 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path)
* Preserve the old cache member information (esp. tier)
* before we start bashing the disk stuff.
*/
- mi = sb.sb->members[sb.sb->nr_this_dev];
- mi.last_mount = cpu_to_le64(ktime_get_seconds());
+ dev_mi = bch_sb_get_members(sb.sb);
+ saved_mi = dev_mi->members[sb.sb->dev_idx];
+ saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
down_read(&c->gc_lock);
@@ -2140,9 +1608,10 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path)
if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
goto no_slot;
- for (nr_this_dev = 0; nr_this_dev < MAX_CACHES_PER_SET; nr_this_dev++)
- if (nr_this_dev >= c->sb.nr_in_set ||
- bch_is_zero(c->disk_mi[nr_this_dev].uuid.b,
+ mi = bch_sb_get_members(c->disk_sb);
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
+ if (dev_idx >= c->sb.nr_devices ||
+ bch_is_zero(mi->members[dev_idx].uuid.b,
sizeof(uuid_le)))
goto have_slot;
no_slot:
@@ -2153,52 +1622,46 @@ no_slot:
goto err_unlock;
have_slot:
- nr_in_set = max_t(unsigned, nr_this_dev + 1, c->sb.nr_in_set);
up_read(&c->gc_lock);
- u64s = nr_in_set * (sizeof(struct cache_member) / sizeof(u64));
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+ u64s = (sizeof(struct bch_sb_field_members) +
+ sizeof(struct bch_member) * nr_devices) / sizeof(u64);
err = "no space in superblock for member info";
- if (bch_super_realloc(&sb, u64s))
+
+ f = bch_fs_sb_field_resize(c, &mi->field, u64s);
+ if (!f)
goto err_unlock;
- new_mi = dynamic_fault("bcache:add:member_info_realloc")
- ? NULL
- : kmalloc(sizeof(struct cache_member) * nr_in_set,
- GFP_KERNEL);
- if (!new_mi) {
- err = "cannot allocate memory";
- ret = -ENOMEM;
+ mi = container_of(f, struct bch_sb_field_members, field);
+
+ f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
+ if (!f)
goto err_unlock;
- }
- memcpy(new_mi, c->disk_mi,
- sizeof(struct cache_member) * nr_in_set);
- new_mi[nr_this_dev] = mi;
+ dev_mi = container_of(f, struct bch_sb_field_members, field);
+ memcpy(dev_mi, mi, u64s * sizeof(u64));
+ dev_mi->members[dev_idx] = saved_mi;
- sb.sb->nr_this_dev = nr_this_dev;
- sb.sb->nr_in_set = nr_in_set;
- sb.sb->u64s = cpu_to_le16(u64s);
- memcpy(sb.sb->members, new_mi,
- sizeof(struct cache_member) * nr_in_set);
+ sb.sb->dev_idx = dev_idx;
+ sb.sb->nr_devices = nr_devices;
- if (cache_set_mi_update(c, new_mi, nr_in_set)) {
+ if (bch_cache_set_mi_update(c, dev_mi->members, nr_devices)) {
err = "cannot allocate memory";
ret = -ENOMEM;
goto err_unlock;
}
/* commit new member info */
- swap(c->disk_mi, new_mi);
- kfree(new_mi);
- new_mi = NULL;
- c->disk_sb.nr_in_set = nr_in_set;
- c->sb.nr_in_set = nr_in_set;
+ memcpy(mi, dev_mi, u64s * sizeof(u64));
+ c->disk_sb->nr_devices = nr_devices;
+ c->sb.nr_devices = nr_devices;
err = cache_alloc(&sb, c, &ca);
if (err)
goto err_unlock;
- bcache_write_super(c);
+ bch_write_super(c);
err = "journal alloc failed";
if (bch_cache_journal_alloc(ca))
@@ -2206,21 +1669,23 @@ have_slot:
bch_notify_cache_added(ca);
- if (ca->mi.state == CACHE_ACTIVE) {
+ if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
err = __bch_cache_read_write(c, ca);
if (err)
goto err_put;
}
kobject_put(&ca->kobj);
+ mutex_unlock(&c->sb_lock);
mutex_unlock(&bch_register_lock);
return 0;
err_put:
bch_cache_stop(ca);
err_unlock:
- kfree(new_mi);
- free_super(&sb);
+ mutex_unlock(&c->sb_lock);
+err_unlock_register:
mutex_unlock(&bch_register_lock);
+ bch_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
return ret ?: -EINVAL;
@@ -2250,14 +1715,14 @@ const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
goto err;
/*
- * read_super() needs to happen under register_lock, so that the
+ * bch_read_super() needs to happen under register_lock, so that the
* exclusive open is atomic with adding the new cache set to the list of
* cache sets:
*/
mutex_lock(&bch_register_lock);
for (i = 0; i < nr_devices; i++) {
- err = read_super(&sb[i], opts, devices[i]);
+ err = bch_read_super(&sb[i], opts, devices[i]);
if (err)
goto err_unlock;
@@ -2265,13 +1730,13 @@ const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
goto err_unlock;
- err = validate_cache_super(&sb[i]);
+ err = bch_validate_cache_super(&sb[i]);
if (err)
goto err_unlock;
}
err = "cache set already registered";
- if (cache_set_lookup(sb->sb->set_uuid))
+ if (cache_set_lookup(sb->sb->uuid))
goto err_unlock;
err = "cannot allocate memory";
@@ -2317,7 +1782,7 @@ err_unlock:
mutex_unlock(&bch_register_lock);
err:
for (i = 0; i < nr_devices; i++)
- free_super(&sb[i]);
+ bch_free_super(&sb[i]);
goto out;
}
@@ -2329,7 +1794,7 @@ const char *bch_register_one(const char *path)
mutex_lock(&bch_register_lock);
- err = read_super(&sb, opts, path);
+ err = bch_read_super(&sb, opts, path);
if (err)
goto err;
@@ -2338,7 +1803,7 @@ const char *bch_register_one(const char *path)
else
err = register_cache(&sb, opts);
- free_super(&sb);
+ bch_free_super(&sb);
err:
mutex_unlock(&bch_register_lock);
return err;
@@ -2440,8 +1905,8 @@ static void bcache_exit(void)
class_destroy(bch_chardev_class);
if (bch_chardev_major > 0)
unregister_chrdev(bch_chardev_major, "bcache");
- if (!IS_ERR_OR_NULL(bch_sha1))
- crypto_free_shash(bch_sha1);
+ if (!IS_ERR_OR_NULL(bch_sha256))
+ crypto_free_shash(bch_sha256);
unregister_reboot_notifier(&reboot);
}
@@ -2459,8 +1924,8 @@ static int __init bcache_init(void)
closure_debug_init();
bkey_pack_test();
- bch_sha1 = crypto_alloc_shash("sha1", 0, 0);
- if (IS_ERR(bch_sha1))
+ bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(bch_sha256))
goto err;
bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops);
diff --git a/libbcache/super.h b/libbcache/super.h
index 635e1a6..014d7ae 100644
--- a/libbcache/super.h
+++ b/libbcache/super.h
@@ -18,17 +18,12 @@ static inline sector_t bucket_remainder(const struct cache *ca, sector_t s)
return s & (ca->mi.bucket_size - 1);
}
-#define cache_member_info_get(_c) \
- (rcu_read_lock(), rcu_dereference((_c)->members))
-
-#define cache_member_info_put() rcu_read_unlock()
-
static inline struct cache *bch_next_cache_rcu(struct cache_set *c,
unsigned *iter)
{
struct cache *ret = NULL;
- while (*iter < c->sb.nr_in_set &&
+ while (*iter < c->sb.nr_devices &&
!(ret = rcu_dereference(c->cache[*iter])))
(*iter)++;
@@ -59,40 +54,6 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
(ca = bch_get_next_cache(c, &(iter))); \
percpu_ref_put(&ca->ref), (iter)++)
-void bch_check_mark_super_slowpath(struct cache_set *,
- const struct bkey_i *, bool);
-
-static inline bool bch_check_super_marked(struct cache_set *c,
- const struct bkey_i *k, bool meta)
-{
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
- const struct bch_extent_ptr *ptr;
- struct cache_member_cpu *mi = cache_member_info_get(c)->m;
- bool ret = true;
-
- extent_for_each_ptr(e, ptr)
- if (!(meta
- ? mi[ptr->dev].has_metadata
- : mi[ptr->dev].has_data) &&
- bch_extent_ptr_is_dirty(c, e, ptr)) {
- ret = false;
- break;
- }
-
- cache_member_info_put();
-
- return ret;
-}
-
-static inline void bch_check_mark_super(struct cache_set *c,
- const struct bkey_i *k, bool meta)
-{
- if (bch_check_super_marked(c, k, meta))
- return;
-
- bch_check_mark_super_slowpath(c, k, meta);
-}
-
static inline bool bch_cache_may_remove(struct cache *ca)
{
struct cache_set *c = ca->set;
@@ -119,11 +80,6 @@ static inline bool bch_cache_may_remove(struct cache *ca)
rcu_access_pointer(tier->d[0].dev) != ca;
}
-void free_super(struct bcache_superblock *);
-int bch_super_realloc(struct bcache_superblock *, unsigned);
-void bcache_write_super(struct cache_set *);
-void __write_super(struct cache_set *, struct bcache_superblock *);
-
void bch_cache_set_release(struct kobject *);
void bch_cache_release(struct kobject *);
@@ -149,7 +105,7 @@ extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
extern struct idr bch_cache_set_minor;
extern struct workqueue_struct *bcache_io_wq;
-extern struct crypto_shash *bch_sha1;
+extern struct crypto_shash *bch_sha256;
extern struct kobj_type bch_cache_set_ktype;
extern struct kobj_type bch_cache_set_internal_ktype;
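The super.h iteration helpers now bound the device walk by c->sb.nr_devices rather than nr_in_set. A rough sketch of how a caller might walk the RCU-protected device array with bch_next_cache_rcu(); demo_count_active() is hypothetical and not part of this patch:

static unsigned demo_count_active(struct cache_set *c)
{
	struct cache *ca;
	unsigned i, nr = 0;

	rcu_read_lock();
	for (i = 0; (ca = bch_next_cache_rcu(c, &i)); i++)
		/* count only devices currently marked active in the member info */
		if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
			nr++;
	rcu_read_unlock();

	return nr;
}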
diff --git a/libbcache/super_types.h b/libbcache/super_types.h
index d89f780..41eaf0d 100644
--- a/libbcache/super_types.h
+++ b/libbcache/super_types.h
@@ -2,7 +2,7 @@
#define _BCACHE_SUPER_TYPES_H
struct bcache_superblock {
- struct cache_sb *sb;
+ struct bch_sb *sb;
struct block_device *bdev;
struct bio *bio;
unsigned page_order;
diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c
index 58a7125..57b7dd9 100644
--- a/libbcache/sysfs.c
+++ b/libbcache/sysfs.c
@@ -8,9 +8,11 @@
#include "bcache.h"
#include "alloc.h"
#include "blockdev.h"
+#include "compress.h"
#include "sysfs.h"
#include "btree_cache.h"
#include "btree_iter.h"
+#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
#include "inode.h"
@@ -19,6 +21,7 @@
#include "move.h"
#include "opts.h"
#include "request.h"
+#include "super-io.h"
#include "writeback.h"
#include <linux/blkdev.h>
@@ -139,14 +142,14 @@ read_attribute(tier);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
static struct attribute sysfs_opt_##_name = { \
.name = #_name, \
.mode = S_IRUGO|(_perm ? S_IWUSR : 0) \
};
- CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+ BCH_VISIBLE_OPTS()
+#undef BCH_OPT
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_time_stats_attribute(name, frequency_units, duration_units);
@@ -193,8 +196,8 @@ SHOW(bch_cached_dev)
sysfs_print(state, states[BDEV_STATE(dc->disk_sb.sb)]);
if (attr == &sysfs_label) {
- memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
- buf[SB_LABEL_SIZE + 1] = '\0';
+ memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
+ buf[BCH_SB_LABEL_SIZE + 1] = '\0';
strcat(buf, "\n");
return strlen(buf);
}
@@ -248,24 +251,25 @@ STORE(__cached_dev)
u64 journal_seq = 0;
int ret = 0;
- if (size > SB_LABEL_SIZE)
+ if (size > BCH_SB_LABEL_SIZE)
return -EINVAL;
mutex_lock(&dc->disk.inode_lock);
memcpy(dc->disk_sb.sb->label, buf, size);
- if (size < SB_LABEL_SIZE)
+ if (size < BCH_SB_LABEL_SIZE)
dc->disk_sb.sb->label[size] = '\0';
if (size && dc->disk_sb.sb->label[size - 1] == '\n')
dc->disk_sb.sb->label[size - 1] = '\0';
memcpy(dc->disk.inode.v.i_label,
- dc->disk_sb.sb->label, SB_LABEL_SIZE);
+ dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
bch_write_bdev_super(dc, NULL);
if (dc->disk.c)
- ret = bch_inode_update(dc->disk.c, &dc->disk.inode.k_i,
+ ret = bch_btree_update(dc->disk.c, BTREE_ID_INODES,
+ &dc->disk.inode.k_i,
&journal_seq);
mutex_unlock(&dc->disk.inode_lock);
@@ -367,8 +371,8 @@ SHOW(bch_blockdev_volume)
sysfs_hprint(size, le64_to_cpu(d->inode.v.i_size));
if (attr == &sysfs_label) {
- memcpy(buf, d->inode.v.i_label, SB_LABEL_SIZE);
- buf[SB_LABEL_SIZE + 1] = '\0';
+ memcpy(buf, d->inode.v.i_label, BCH_SB_LABEL_SIZE);
+ buf[BCH_SB_LABEL_SIZE + 1] = '\0';
strcat(buf, "\n");
return strlen(buf);
}
@@ -397,7 +401,8 @@ STORE(__bch_blockdev_volume)
}
}
d->inode.v.i_size = cpu_to_le64(v);
- ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+ ret = bch_btree_update(d->c, BTREE_ID_INODES,
+ &d->inode.k_i, &journal_seq);
mutex_unlock(&d->inode_lock);
@@ -417,8 +422,9 @@ STORE(__bch_blockdev_volume)
mutex_lock(&d->inode_lock);
- memcpy(d->inode.v.i_label, buf, SB_LABEL_SIZE);
- ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+ memcpy(d->inode.v.i_label, buf, BCH_SB_LABEL_SIZE);
+ ret = bch_btree_update(d->c, BTREE_ID_INODES,
+ &d->inode.k_i, &journal_seq);
mutex_unlock(&d->inode_lock);
@@ -677,10 +683,8 @@ SHOW(bch_cache_set)
sysfs_print(tiering_percent, c->tiering_percent);
sysfs_pd_controller_show(tiering, &c->tiering_pd);
- sysfs_printf(meta_replicas_have, "%llu",
- CACHE_SET_META_REPLICAS_HAVE(&c->disk_sb));
- sysfs_printf(data_replicas_have, "%llu",
- CACHE_SET_DATA_REPLICAS_HAVE(&c->disk_sb));
+ sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
+ sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
/* Debugging: */
@@ -705,7 +709,7 @@ SHOW(bch_cache_set)
if (attr == &sysfs_compression_stats)
return bch_compression_stats(c, buf);
- sysfs_printf(internal_uuid, "%pU", c->disk_sb.set_uuid.b);
+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
return 0;
}
@@ -945,15 +949,15 @@ SHOW(bch_cache_set_opts_dir)
{
struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
if (attr == &sysfs_opt_##_name) \
return _choices == bch_bool_opt || _choices == bch_uint_opt\
? snprintf(buf, PAGE_SIZE, "%i\n", c->opts._name)\
: bch_snprint_string_list(buf, PAGE_SIZE, \
_choices, c->opts._name);\
- CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+ BCH_VISIBLE_OPTS()
+#undef BCH_OPT
return 0;
}
@@ -962,7 +966,7 @@ STORE(bch_cache_set_opts_dir)
{
struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
if (attr == &sysfs_opt_##_name) { \
ssize_t v = (_choices == bch_bool_opt || \
_choices == bch_uint_opt) \
@@ -972,18 +976,28 @@ STORE(bch_cache_set_opts_dir)
if (v < 0) \
return v; \
\
- c->opts._name = v; \
+ mutex_lock(&c->sb_lock); \
+ if (attr == &sysfs_opt_compression) { \
+ int ret = bch_check_set_has_compressed_data(c, v);\
+ if (ret) { \
+ mutex_unlock(&c->sb_lock); \
+ return ret; \
+ } \
+ } \
\
- if (_sb_opt##_BITS && v != _sb_opt(&c->disk_sb)) { \
- SET_##_sb_opt(&c->disk_sb, v); \
- bcache_write_super(c); \
+ if (_sb_opt##_BITS && v != _sb_opt(c->disk_sb)) { \
+ SET_##_sb_opt(c->disk_sb, v); \
+ bch_write_super(c); \
} \
\
+ c->opts._name = v; \
+ mutex_unlock(&c->sb_lock); \
+ \
return size; \
}
- CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+ BCH_VISIBLE_OPTS()
+#undef BCH_OPT
return size;
}
@@ -993,11 +1007,11 @@ static void bch_cache_set_opts_dir_release(struct kobject *k)
}
static struct attribute *bch_cache_set_opts_dir_files[] = {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \
&sysfs_opt_##_name,
- CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+ BCH_VISIBLE_OPTS()
+#undef BCH_OPT
NULL
};
@@ -1176,7 +1190,7 @@ SHOW(bch_cache)
struct cache_set *c = ca->set;
struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
- sysfs_printf(uuid, "%pU\n", ca->disk_sb.sb->disk_uuid.b);
+ sysfs_printf(uuid, "%pU\n", ca->uuid.b);
sysfs_hprint(bucket_size, bucket_bytes(ca));
sysfs_print(bucket_size_bytes, bucket_bytes(ca));
@@ -1242,17 +1256,21 @@ STORE(__bch_cache)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
struct cache_set *c = ca->set;
- struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev];
+ struct bch_member *mi;
sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
- if (v != CACHE_DISCARD(mi)) {
- SET_CACHE_DISCARD(mi, v);
- bcache_write_super(c);
+ mutex_lock(&c->sb_lock);
+ mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+
+ if (v != BCH_MEMBER_DISCARD(mi)) {
+ SET_BCH_MEMBER_DISCARD(mi, v);
+ bch_write_super(c);
}
+ mutex_unlock(&c->sb_lock);
}
if (attr == &sysfs_cache_replacement_policy) {
@@ -1261,10 +1279,14 @@ STORE(__bch_cache)
if (v < 0)
return v;
- if ((unsigned) v != CACHE_REPLACEMENT(mi)) {
- SET_CACHE_REPLACEMENT(mi, v);
- bcache_write_super(c);
+ mutex_lock(&c->sb_lock);
+ mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+
+ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
+ SET_BCH_MEMBER_REPLACEMENT(mi, v);
+ bch_write_super(c);
}
+ mutex_unlock(&c->sb_lock);
}
if (attr == &sysfs_state_rw) {
@@ -1279,14 +1301,14 @@ STORE(__bch_cache)
return size;
switch (v) {
- case CACHE_ACTIVE:
+ case BCH_MEMBER_STATE_ACTIVE:
err = bch_cache_read_write(ca);
break;
- case CACHE_RO:
+ case BCH_MEMBER_STATE_RO:
bch_cache_read_only(ca);
break;
- case CACHE_FAILED:
- case CACHE_SPARE:
+ case BCH_MEMBER_STATE_FAILED:
+ case BCH_MEMBER_STATE_SPARE:
/*
* XXX: need to migrate data off and set correct state
*/
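The sysfs stores above show the convention this patch introduces for mutating per-device member flags: take c->sb_lock, look the member up from the current superblock under the lock, flip the bit, and call bch_write_super() before unlocking. The same shape factored into a helper, as a sketch only (bch_dev_set_discard() is illustrative, not part of the patch):

static void bch_dev_set_discard(struct cache_set *c, struct cache *ca, bool v)
{
	struct bch_member *mi;

	mutex_lock(&c->sb_lock);
	/* re-derive the member pointer under sb_lock, matching the stores above */
	mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];

	if (v != BCH_MEMBER_DISCARD(mi)) {
		SET_BCH_MEMBER_DISCARD(mi, v);
		bch_write_super(c);
	}
	mutex_unlock(&c->sb_lock);
}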
diff --git a/libbcache/tier.c b/libbcache/tier.c
index 39b04f7..4686459 100644
--- a/libbcache/tier.c
+++ b/libbcache/tier.c
@@ -8,6 +8,7 @@
#include "io.h"
#include "keylist.h"
#include "move.h"
+#include "super-io.h"
#include "tier.h"
#include <linux/freezer.h>
@@ -40,7 +41,7 @@ static bool tiering_pred(struct cache_set *c,
mi = cache_member_info_get(c);
extent_for_each_ptr(e, ptr)
- if (ptr->dev < mi->nr_in_set &&
+ if (ptr->dev < mi->nr_devices &&
mi->m[ptr->dev].tier >= s->tier_idx)
replicas++;
cache_member_info_put();
diff --git a/libbcache/vstructs.h b/libbcache/vstructs.h
new file mode 100644
index 0000000..ce2cece
--- /dev/null
+++ b/libbcache/vstructs.h
@@ -0,0 +1,62 @@
+#ifndef _VSTRUCTS_H
+#define _VSTRUCTS_H
+
+#include "util.h"
+
+/*
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
+ * assumes u64 is little endian:
+ */
+#define __vstruct_u64s(_s) \
+({ \
+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \
+ : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \
+ : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \
+ : ((_s)->u64s)); \
+})
+
+#define __vstruct_bytes(_type, _u64s) \
+({ \
+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
+ \
+ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
+})
+
+#define vstruct_bytes(_s) \
+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
+
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
+ (round_up(__vstruct_bytes(_type, _u64s), \
+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
+
+#define vstruct_blocks(_s, _sector_block_bits) \
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
+
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
+ __vstruct_u64s(_s) + (_u64s))
+
+#define vstruct_sectors(_s, _sector_block_bits) \
+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
+
+#define vstruct_next(_s) \
+ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_last(_s) \
+ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_end(_s) \
+ ((void *) ((_s)->_data + __vstruct_u64s(_s)))
+
+#define vstruct_for_each(_s, _i) \
+ for (_i = (_s)->start; \
+ _i < vstruct_last(_s); \
+ _i = vstruct_next(_i))
+
+#define vstruct_for_each_safe(_s, _i, _t) \
+ for (_i = (_s)->start; \
+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
+ _i = _t)
+
+#define vstruct_idx(_s, _idx) \
+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
+
+#endif /* _VSTRUCTS_H */
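The new vstructs.h factors out the size and iteration arithmetic shared by on-disk structures that carry their payload length in a u64s field followed by a flexible _data[] array (superblock fields, journal sets, and similar). A rough usage sketch against a hypothetical entry layout, assuming the kernel __le16/le16_to_cpu helpers and type_is() from util.h; demo_entry and demo_total_bytes() are illustrative only:

#include "vstructs.h"

/* Hypothetical on-disk entry; anything with a u64s count and _data[] fits. */
struct demo_entry {
	__le16	u64s;		/* payload length, in u64 units */
	__le16	type;
	__u32	pad;
	__u64	_data[0];
};

/* Sum the byte size of nr consecutive entries starting at first. */
static size_t demo_total_bytes(struct demo_entry *first, unsigned nr)
{
	struct demo_entry *e = first;
	size_t total = 0;

	while (nr--) {
		total += vstruct_bytes(e);	/* header plus u64s payload words */
		e = vstruct_next(e);		/* step past this entry's payload */
	}

	return total;
}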
diff --git a/libbcache/xattr.c b/libbcache/xattr.c
index e9e0a9a..56a8e8f 100644
--- a/libbcache/xattr.c
+++ b/libbcache/xattr.c
@@ -9,7 +9,6 @@
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
-#include <crypto/hash.h>
struct xattr_search_key {
u8 type;
@@ -22,37 +21,13 @@ struct xattr_search_key {
static u64 bch_xattr_hash(const struct bch_hash_info *info,
const struct xattr_search_key *key)
{
- switch (info->type) {
- case BCH_STR_HASH_SHA1: {
- SHASH_DESC_ON_STACK(desc, bch_sha1);
- u8 digest[SHA1_DIGEST_SIZE];
- u64 ret;
+ struct bch_str_hash_ctx ctx;
- desc->tfm = bch_sha1;
- desc->flags = 0;
- crypto_shash_init(desc);
+ bch_str_hash_init(&ctx, info);
+ bch_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
+ bch_str_hash_update(&ctx, info, key->name.name, key->name.len);
- crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
-
- crypto_shash_update(desc, (void *) &key->type, sizeof(key->type));
- crypto_shash_update(desc, (void *) key->name.name, key->name.len);
-
- crypto_shash_final(desc, digest);
- memcpy(&ret, &digest, sizeof(ret));
- return ret >> 1;
- }
- default: {
- struct bch_str_hash_ctx ctx;
-
- bch_str_hash_init(&ctx, info->type);
- bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
-
- bch_str_hash_update(&ctx, info->type, &key->type, sizeof(key->type));
- bch_str_hash_update(&ctx, info->type, key->name.name, key->name.len);
-
- return bch_str_hash_end(&ctx, info->type);
- }
- }
+ return bch_str_hash_end(&ctx, info);
}
#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
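With the SHA-1 special case removed, bch_xattr_hash() above is just one client of the generic bch_str_hash_ctx init/update/end interface; the seed is presumably folded in by bch_str_hash_init(), since callers no longer mix it in themselves. A hypothetical second client hashing a bare name the same way (demo_name_hash() is illustrative, not from this patch):

static u64 demo_name_hash(const struct bch_hash_info *info,
			  const struct qstr *name)
{
	struct bch_str_hash_ctx ctx;

	bch_str_hash_init(&ctx, info);
	bch_str_hash_update(&ctx, info, name->name, name->len);
	return bch_str_hash_end(&ctx, info);
}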