diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2016-10-03 19:22:17 -0800 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@gmail.com> | 2017-02-28 03:05:38 -0900 |
commit | a5b5eba7f788bb77cf57f9c94f3474a2d439ab0b (patch) | |
tree | 278813d1b1a9024174531376d41a2ba04a3b27f6 | |
parent | e4d1c93d85a5b86c04599bfc9f658308d741fd41 (diff) |
New on disk format - encryption
99 files changed, 5438 insertions, 5777 deletions
diff --git a/.bcache_revision b/.bcache_revision index 5caaaba2..8fb728e4 100644 --- a/.bcache_revision +++ b/.bcache_revision @@ -1 +1 @@ -BCACHE_REVISION=76e3b2312705df2cb5adb8834bc6df56a288932e +BCACHE_REVISION=561f3067172cbfc63a680cfb670d558724441123 @@ -20,9 +20,10 @@ else LDFLAGS+=-flto endif -PKGCONFIG_LIBS="blkid uuid liburcu" +PKGCONFIG_LIBS="blkid uuid liburcu libsodium" CFLAGS+=`pkg-config --cflags ${PKGCONFIG_LIBS}` -LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}` -lm -lpthread -lrt +LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}` \ + -lm -lpthread -lrt -lscrypt -lkeyutils ifeq ($(PREFIX),/usr) ROOT_SBINDIR=/sbin @@ -48,7 +49,9 @@ OBJS=bcache.o \ cmd_fs.o \ cmd_fsck.o \ cmd_format.o \ + cmd_key.o \ cmd_run.o \ + crypto.o \ libbcache.o \ qcow2.o \ tools-util.o \ diff --git a/bcache-userspace-shim.c b/bcache-userspace-shim.c index 9be5b507..8634d8f7 100644 --- a/bcache-userspace-shim.c +++ b/bcache-userspace-shim.c @@ -144,6 +144,7 @@ enum fsck_err_opts fsck_err_opt; #include "six.c" //#include "stats.c" #include "super.c" +#include "super-io.c" //#include "sysfs.c" #include "tier.c" #include "trace.c" @@ -30,6 +30,7 @@ static void usage(void) "\n" "Commands for formatting, startup and shutdown:\n" " format Format a new filesystem\n" + " unlock Unlock an encrypted filesystem prior to running/mounting\n" " assemble Assemble an existing multi device filesystem\n" " incremental Incrementally assemble an existing multi device filesystem\n" " run Start a partially assembled filesystem\n" @@ -46,6 +47,7 @@ static void usage(void) "\n" "Repair:\n" " bcache fsck Check an existing filesystem for errors\n" + "\n" "Debug:\n" " bcache dump Dump filesystem metadata to a qcow2 image\n" " bcache list List filesystem metadata in textual form\n"); @@ -94,6 +96,9 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "fsck")) return cmd_fsck(argc, argv); + if (!strcmp(cmd, "unlock")) + return cmd_unlock(argc, argv); + if (!strcmp(cmd, "dump")) return cmd_dump(argc, argv); if 
(!strcmp(cmd, "list")) diff --git a/cmd_debug.c b/cmd_debug.c index 0813d292..df23ae10 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -27,21 +27,27 @@ static void dump_usage(void) "Report bugs to <linux-bcache@vger.kernel.org>"); } -void dump_one_device(struct cache_set *c, struct cache *ca, int fd) +static void dump_one_device(struct cache_set *c, struct cache *ca, int fd) { - struct cache_sb *sb = ca->disk_sb.sb; + struct bch_sb *sb = ca->disk_sb.sb; sparse_data data; unsigned i; darray_init(data); /* Superblock: */ - data_add(&data, SB_SECTOR << 9, __set_bytes(sb, le16_to_cpu(sb->u64s))); + data_add(&data, BCH_SB_LAYOUT_SECTOR << 9, + sizeof(struct bch_sb_layout)); + + for (i = 0; i < sb->layout.nr_superblocks; i++) + data_add(&data, + le64_to_cpu(sb->layout.sb_offset[i]) << 9, + vstruct_bytes(sb)); /* Journal: */ - for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) + for (i = 0; i < ca->journal.nr; i++) if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) { - u64 bucket = journal_bucket(ca->disk_sb.sb, i); + u64 bucket = ca->journal.buckets[i]; data_add(&data, bucket_bytes(ca) * bucket, @@ -64,7 +70,7 @@ void dump_one_device(struct cache_set *c, struct cache *ca, int fd) struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); extent_for_each_ptr(e, ptr) - if (ptr->dev == ca->sb.nr_this_dev) + if (ptr->dev == ca->dev_idx) data_add(&data, ptr->offset << 9, b->written << 9); @@ -120,13 +126,13 @@ int cmd_dump(int argc, char *argv[]) down_read(&c->gc_lock); - for (i = 0; i < c->sb.nr_in_set; i++) + for (i = 0; i < c->sb.nr_devices; i++) if (c->cache[i]) nr_devices++; BUG_ON(!nr_devices); - for (i = 0; i < c->sb.nr_in_set; i++) { + for (i = 0; i < c->sb.nr_devices; i++) { int mode = O_WRONLY|O_CREAT|O_TRUNC; if (!force) @@ -155,8 +161,8 @@ int cmd_dump(int argc, char *argv[]) return 0; } -void list_keys(struct cache_set *c, enum btree_id btree_id, - struct bpos start, struct bpos end, int mode) +static void list_keys(struct cache_set *c, enum 
btree_id btree_id, + struct bpos start, struct bpos end, int mode) { struct btree_iter iter; struct bkey_s_c k; @@ -173,8 +179,8 @@ void list_keys(struct cache_set *c, enum btree_id btree_id, bch_btree_iter_unlock(&iter); } -void list_btree_formats(struct cache_set *c, enum btree_id btree_id, - struct bpos start, struct bpos end, int mode) +static void list_btree_formats(struct cache_set *c, enum btree_id btree_id, + struct bpos start, struct bpos end, int mode) { struct btree_iter iter; struct btree *b; @@ -190,7 +196,7 @@ void list_btree_formats(struct cache_set *c, enum btree_id btree_id, bch_btree_iter_unlock(&iter); } -struct bpos parse_pos(char *buf) +static struct bpos parse_pos(char *buf) { char *s = buf; char *inode = strsep(&s, ":"); diff --git a/cmd_device.c b/cmd_device.c index ecb63bb4..1c5208af 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -103,7 +103,7 @@ int cmd_device_show(int argc, char *argv[]) struct bcache_dev devices[256]; unsigned i, j, nr_devices = 0, nr_active_tiers = 0; - unsigned tiers[CACHE_TIERS]; /* number of devices in each tier */ + unsigned tiers[BCH_TIER_MAX]; /* number of devices in each tier */ memset(tiers, 0, sizeof(tiers)); while ((entry = readdir(fs.sysfs))) { @@ -133,14 +133,14 @@ int cmd_device_show(int argc, char *argv[]) close(fd); } - for (i = 0; i < CACHE_TIERS; i++) + for (i = 0; i < BCH_TIER_MAX; i++) if (tiers[i]) nr_active_tiers++; /* Print out devices sorted by tier: */ bool first = true; - for (i = 0; i < CACHE_TIERS; i++) { + for (i = 0; i < BCH_TIER_MAX; i++) { if (!tiers[i]) continue; @@ -168,7 +168,7 @@ int cmd_device_show(int argc, char *argv[]) int cmd_device_show(int argc, char *argv[]) { - struct cache_sb *sb; + struct bch_sb *sb; if (argc != 2) die("please supply a single device"); diff --git a/cmd_format.c b/cmd_format.c index b955b416..2b1453ee 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -24,6 +24,7 @@ #include "cmds.h" #include "libbcache.h" +#include "crypto.h" #include "opts.h" #include 
"util.h" @@ -80,6 +81,7 @@ static void usage(void) " --metadata_checksum_type=(none|crc32c|crc64)\n" " --data_checksum_type=(none|crc32c|crc64)\n" " --compression_type=(none|lz4|gzip)\n" + " --encrypted\n" " --error_action=(continue|readonly|panic)\n" " Action to take on filesystem error\n" " --max_journal_entry_size=size\n" @@ -107,6 +109,7 @@ static void usage(void) OPT(0, metadata_checksum_type, required_argument) \ OPT(0, data_checksum_type, required_argument) \ OPT(0, compression_type, required_argument) \ + OPT(0, encrypted, no_argument) \ OPT('e', error_action, required_argument) \ OPT(0, max_journal_entry_size, required_argument) \ OPT('L', label, required_argument) \ @@ -164,6 +167,7 @@ int cmd_format(int argc, char *argv[]) unsigned meta_csum_type = BCH_CSUM_CRC32C; unsigned data_csum_type = BCH_CSUM_CRC32C; unsigned compression_type = BCH_COMPRESSION_NONE; + bool encrypted = false; unsigned on_error_action = BCH_ON_ERROR_RO; char *label = NULL; uuid_le uuid; @@ -208,6 +212,9 @@ int cmd_format(int argc, char *argv[]) bch_compression_types, "compression type"); break; + case Opt_encrypted: + encrypted = true; + break; case Opt_error_action: case 'e': on_error_action = read_string_list_or_die(optarg, @@ -242,7 +249,7 @@ int cmd_format(int argc, char *argv[]) case Opt_tier: case 't': if (kstrtouint(optarg, 10, &tier) || - tier >= CACHE_TIERS) + tier >= BCH_TIER_MAX) die("invalid tier"); break; case Opt_discard: @@ -270,6 +277,24 @@ int cmd_format(int argc, char *argv[]) if (uuid_is_null(uuid.b)) uuid_generate(uuid.b); + if (encrypted) { + passphrase = read_passphrase("Enter passphrase: "); + + if (isatty(STDIN_FILENO)) { + char *pass2 = + read_passphrase("Enter same passphrase again: "); + + if (strcmp(passphrase, pass2)) { + memzero_explicit(passphrase, strlen(passphrase)); + memzero_explicit(pass2, strlen(pass2)); + die("Passphrases do not match"); + } + + memzero_explicit(pass2, strlen(pass2)); + free(pass2); + } + } + darray_foreach(dev, devices) dev->fd 
= open_for_format(dev->path, force); @@ -279,6 +304,7 @@ int cmd_format(int argc, char *argv[]) meta_csum_type, data_csum_type, compression_type, + passphrase, 1, 1, on_error_action, diff --git a/cmd_key.c b/cmd_key.c new file mode 100644 index 00000000..587ecbe3 --- /dev/null +++ b/cmd_key.c @@ -0,0 +1,62 @@ +#include <errno.h> +#include <unistd.h> +#include <keyutils.h> +#include <uuid/uuid.h> + +#include "cmds.h" +#include "checksum.h" +#include "crypto.h" +#include "libbcache.h" + +int cmd_unlock(int argc, char *argv[]) +{ + struct bch_encrypted_key sb_key; + struct bch_key passphrase_key; + struct bch_sb *sb; + struct bch_sb_field_crypt *crypt; + char *passphrase; + char uuid[40]; + char description[60]; + + if (argc != 2) + die("please supply a single device"); + + sb = bcache_super_read(argv[1]); + + crypt = bch_sb_get_crypt(sb); + if (!crypt) + die("filesystem is not encrypted"); + + sb_key = crypt->key; + + if (!bch_key_is_encrypted(&sb_key)) + die("filesystem does not have encryption key"); + + passphrase = read_passphrase("Enter passphrase: "); + derive_passphrase(crypt, &passphrase_key, passphrase); + + /* Check if the user supplied the correct passphrase: */ + if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), + &sb_key, sizeof(sb_key))) + die("error encrypting key"); + + if (bch_key_is_encrypted(&sb_key)) + die("incorrect passphrase"); + + uuid_unparse_lower(sb->user_uuid.b, uuid); + sprintf(description, "bcache:%s", uuid); + + if (add_key("logon", description, + &passphrase_key, sizeof(passphrase_key), + KEY_SPEC_USER_KEYRING) < 0 || + add_key("user", description, + &passphrase_key, sizeof(passphrase_key), + KEY_SPEC_USER_KEYRING) < 0) + die("add_key error: %s", strerror(errno)); + + memzero_explicit(&sb_key, sizeof(sb_key)); + memzero_explicit(&passphrase_key, sizeof(passphrase_key)); + memzero_explicit(passphrase, strlen(passphrase)); + free(passphrase); + return 0; +} @@ -11,6 +11,7 @@ int cmd_format(int argc, char *argv[]); +int 
cmd_unlock(int argc, char *argv[]); int cmd_assemble(int argc, char *argv[]); int cmd_incremental(int argc, char *argv[]); int cmd_run(int argc, char *argv[]); diff --git a/crypto.c b/crypto.c new file mode 100644 index 00000000..86da70a1 --- /dev/null +++ b/crypto.c @@ -0,0 +1,103 @@ +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <termios.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> +#include <unistd.h> + +#include <linux/random.h> +#include <libscrypt.h> + +#include "checksum.h" +#include "crypto.h" + +char *read_passphrase(const char *prompt) +{ + char *buf = NULL; + size_t buflen = 0; + ssize_t len; + + if (isatty(STDIN_FILENO)) { + struct termios old, new; + + fprintf(stderr, "%s", prompt); + fflush(stderr); + + if (tcgetattr(STDIN_FILENO, &old)) + die("error getting terminal attrs"); + + new = old; + new.c_lflag &= ~ECHO; + if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &new)) + die("error setting terminal attrs"); + + len = getline(&buf, &buflen, stdin); + + tcsetattr(STDIN_FILENO, TCSAFLUSH, &old); + fprintf(stderr, "\n"); + } else { + len = getline(&buf, &buflen, stdin); + } + + if (len < 0) + die("error reading passphrase"); + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + + return buf; +} + +void derive_passphrase(struct bch_sb_field_crypt *crypt, + struct bch_key *key, + const char *passphrase) +{ + const unsigned char salt[] = "bcache"; + int ret; + + switch (BCH_CRYPT_KDF_TYPE(crypt)) { + case BCH_KDF_SCRYPT: + ret = libscrypt_scrypt((void *) passphrase, strlen(passphrase), + salt, sizeof(salt), + 1ULL << BCH_KDF_SCRYPT_N(crypt), + 1ULL << BCH_KDF_SCRYPT_R(crypt), + 1ULL << BCH_KDF_SCRYPT_P(crypt), + (void *) key, sizeof(*key)); + if (ret) + die("scrypt error: %i", ret); + break; + default: + die("unknown kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + } +} + +void bch_sb_crypt_init(struct bch_sb *sb, + struct bch_sb_field_crypt *crypt, + 
const char *passphrase) +{ + struct bch_key passphrase_key; + + SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT); + SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N)); + SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r)); + SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p)); + + derive_passphrase(crypt, &passphrase_key, passphrase); + + crypt->key.magic = BCH_KEY_MAGIC; + get_random_bytes(&crypt->key.key, sizeof(crypt->key.key)); + + assert(!bch_key_is_encrypted(&crypt->key)); + + if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb), + &crypt->key, sizeof(crypt->key))) + die("error encrypting key"); + + assert(bch_key_is_encrypted(&crypt->key)); + + memzero_explicit(&passphrase_key, sizeof(passphrase_key)); +} diff --git a/crypto.h b/crypto.h new file mode 100644 index 00000000..643073eb --- /dev/null +++ b/crypto.h @@ -0,0 +1,13 @@ +#ifndef _CRYPTO_H +#define _CRYPTO_H + +#include "super-io.h" +#include "tools-util.h" + +char *read_passphrase(const char *); +void derive_passphrase(struct bch_sb_field_crypt *, + struct bch_key *, const char *); +void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *, + const char *); + +#endif /* _CRYPTO_H */ diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 31f453ee..d8bfcc1f 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -13,200 +13,24 @@ #define _CRYPTO_ALGAPI_H #include <linux/crypto.h> -#include <linux/device.h> -#include <linux/list.h> -#include <linux/kernel.h> -#include <linux/kthread.h> - -struct crypto_aead; -struct crypto_instance; -struct module; -struct rtattr; -struct seq_file; -struct sk_buff; struct crypto_type { unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask); unsigned int (*extsize)(struct crypto_alg *alg); int (*init)(struct crypto_tfm *tfm, u32 type, u32 mask); int (*init_tfm)(struct crypto_tfm *tfm); - void (*show)(struct seq_file *m, struct crypto_alg *alg); - struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask); - void 
(*free)(struct crypto_instance *inst); - - unsigned int type; - unsigned int maskclear; - unsigned int maskset; - unsigned int tfmsize; -}; - -struct crypto_instance { - struct crypto_alg alg; - - struct crypto_template *tmpl; - struct hlist_node list; - - void *__ctx[] CRYPTO_MINALIGN_ATTR; -}; - -struct crypto_template { - struct list_head list; - struct hlist_head instances; - struct module *module; - - struct crypto_instance *(*alloc)(struct rtattr **tb); - void (*free)(struct crypto_instance *inst); - int (*create)(struct crypto_template *tmpl, struct rtattr **tb); - - char name[CRYPTO_MAX_ALG_NAME]; -}; - -struct scatter_walk { - struct scatterlist *sg; - unsigned int offset; -}; - -struct blkcipher_walk { - union { - struct { - struct page *page; - unsigned long offset; - } phys; - - struct { - u8 *page; - u8 *addr; - } virt; - } src, dst; - struct scatter_walk in; - unsigned int nbytes; - - struct scatter_walk out; - unsigned int total; - - void *page; - u8 *buffer; - u8 *iv; - unsigned int ivsize; - - int flags; - unsigned int walk_blocksize; - unsigned int cipher_blocksize; - unsigned int alignmask; + unsigned type; + unsigned maskclear; + unsigned maskset; + unsigned tfmsize; }; extern const struct crypto_type crypto_blkcipher_type; -struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb); -int crypto_check_attr_type(struct rtattr **tb, u32 type); -const char *crypto_attr_alg_name(struct rtattr *rta); -struct crypto_alg *crypto_attr_alg2(struct rtattr *rta, - const struct crypto_type *frontend, - u32 type, u32 mask); - -static inline struct crypto_alg *crypto_attr_alg(struct rtattr *rta, - u32 type, u32 mask) -{ - return crypto_attr_alg2(rta, NULL, type, mask); -} - -int crypto_attr_u32(struct rtattr *rta, u32 *num); - -/* These functions require the input/output to be aligned as u32. 
*/ -void crypto_inc(u8 *a, unsigned int size); -void crypto_xor(u8 *dst, const u8 *src, unsigned int size); - -int blkcipher_walk_done(struct blkcipher_desc *desc, - struct blkcipher_walk *walk, int err); -int blkcipher_walk_virt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk); -int blkcipher_walk_phys(struct blkcipher_desc *desc, - struct blkcipher_walk *walk); -int blkcipher_walk_virt_block(struct blkcipher_desc *desc, - struct blkcipher_walk *walk, - unsigned int blocksize); -int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc, - struct blkcipher_walk *walk, - struct crypto_aead *tfm, - unsigned int blocksize); - -static inline void *crypto_tfm_ctx_aligned(struct crypto_tfm *tfm) -{ - return PTR_ALIGN(crypto_tfm_ctx(tfm), - crypto_tfm_alg_alignmask(tfm) + 1); -} - -static inline struct crypto_instance *crypto_tfm_alg_instance( - struct crypto_tfm *tfm) -{ - return container_of(tfm->__crt_alg, struct crypto_instance, alg); -} - -static inline void *crypto_instance_ctx(struct crypto_instance *inst) -{ - return inst->__ctx; -} - static inline void *crypto_blkcipher_ctx(struct crypto_blkcipher *tfm) { return crypto_tfm_ctx(&tfm->base); } -static inline void *crypto_blkcipher_ctx_aligned(struct crypto_blkcipher *tfm) -{ - return crypto_tfm_ctx_aligned(&tfm->base); -} - -static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm) -{ - return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher; -} - -static inline void blkcipher_walk_init(struct blkcipher_walk *walk, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - walk->in.sg = src; - walk->out.sg = dst; - walk->total = nbytes; -} - -static inline struct crypto_alg *crypto_get_attr_alg(struct rtattr **tb, - u32 type, u32 mask) -{ - return crypto_attr_alg(tb[1], type, mask); -} - -static inline int crypto_requires_sync(u32 type, u32 mask) -{ - return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC; -} - -noinline unsigned long 
__crypto_memneq(const void *a, const void *b, size_t size); - -/** - * crypto_memneq - Compare two areas of memory without leaking - * timing information. - * - * @a: One area of memory - * @b: Another area of memory - * @size: The size of the area. - * - * Returns 0 when data is equal, 1 otherwise. - */ -static inline int crypto_memneq(const void *a, const void *b, size_t size) -{ - return __crypto_memneq(a, b, size) != 0UL ? 1 : 0; -} - -static inline void crypto_yield(u32 flags) -{ -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) - if (flags & CRYPTO_TFM_REQ_MAY_SLEEP) - cond_resched(); -#endif -} - #endif /* _CRYPTO_ALGAPI_H */ diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h index 20d20f68..1cdc77ba 100644 --- a/include/crypto/chacha20.h +++ b/include/crypto/chacha20.h @@ -12,15 +12,4 @@ #define CHACHA20_KEY_SIZE 32 #define CHACHA20_BLOCK_SIZE 64 -struct chacha20_ctx { - u32 key[8]; -}; - -void chacha20_block(u32 *state, void *stream); -void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); -int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keysize); -int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes); - #endif diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 00bd4e7e..97edaa88 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -16,13 +16,6 @@ #include <linux/crypto.h> #include <linux/string.h> -struct hash_alg_common { - unsigned int digestsize; - unsigned int statesize; - - struct crypto_alg base; -}; - struct shash_desc { struct crypto_shash *tfm; u32 flags; @@ -37,31 +30,21 @@ struct shash_desc { struct shash_alg { int (*init)(struct shash_desc *desc); - int (*update)(struct shash_desc *desc, const u8 *data, - unsigned int len); + int (*update)(struct shash_desc *desc, const u8 *data, unsigned len); int (*final)(struct shash_desc *desc, u8 *out); int (*finup)(struct 
shash_desc *desc, const u8 *data, - unsigned int len, u8 *out); + unsigned len, u8 *out); int (*digest)(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out); - int (*export)(struct shash_desc *desc, void *out); - int (*import)(struct shash_desc *desc, const void *in); - int (*setkey)(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen); - - unsigned int descsize; - - /* These fields must match hash_alg_common. */ - unsigned int digestsize - __attribute__ ((aligned(__alignof__(struct hash_alg_common)))); - unsigned int statesize; + unsigned len, u8 *out); - struct crypto_alg base; + unsigned descsize; + unsigned digestsize; + struct crypto_alg base; }; struct crypto_shash { - unsigned int descsize; - struct crypto_tfm base; + unsigned descsize; + struct crypto_tfm base; }; struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, @@ -77,27 +60,6 @@ static inline void crypto_free_shash(struct crypto_shash *tfm) crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm)); } -static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm) -{ - return crypto_tfm_alg_name(crypto_shash_tfm(tfm)); -} - -static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm) -{ - return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); -} - -static inline unsigned int crypto_shash_alignmask( - struct crypto_shash *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm)); -} - -static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm) -{ - return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm)); -} - static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg) { return container_of(alg, struct shash_alg, base); @@ -108,32 +70,12 @@ static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm) return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg); } -static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm) +static inline unsigned 
crypto_shash_digestsize(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->digestsize; } -static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm) -{ - return crypto_shash_alg(tfm)->statesize; -} - -static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm) -{ - return crypto_tfm_get_flags(crypto_shash_tfm(tfm)); -} - -static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags) -{ - crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags); -} - -static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags) -{ - crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags); -} - -static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm) +static inline unsigned crypto_shash_descsize(struct crypto_shash *tfm) { return tfm->descsize; } @@ -143,39 +85,32 @@ static inline void *shash_desc_ctx(struct shash_desc *desc) return desc->__ctx; } -int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen); - -int crypto_shash_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out); - -static inline int crypto_shash_export(struct shash_desc *desc, void *out) +static inline int crypto_shash_init(struct shash_desc *desc) { - return crypto_shash_alg(desc->tfm)->export(desc, out); + return crypto_shash_alg(desc->tfm)->init(desc); } -static inline int crypto_shash_import(struct shash_desc *desc, const void *in) +static inline int crypto_shash_update(struct shash_desc *desc, + const u8 *data, unsigned len) { - return crypto_shash_alg(desc->tfm)->import(desc, in); + return crypto_shash_alg(desc->tfm)->update(desc, data, len); } -static inline int crypto_shash_init(struct shash_desc *desc) +static inline int crypto_shash_final(struct shash_desc *desc, u8 *out) { - return crypto_shash_alg(desc->tfm)->init(desc); + return crypto_shash_alg(desc->tfm)->final(desc, out); } -int crypto_shash_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -int 
crypto_shash_final(struct shash_desc *desc, u8 *out); - -int crypto_shash_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out); +static inline int crypto_shash_finup(struct shash_desc *desc, const u8 *data, + unsigned len, u8 *out) +{ + return crypto_shash_alg(desc->tfm)->finup(desc, data, len, out); +} -static inline void shash_desc_zero(struct shash_desc *desc) +static inline int crypto_shash_digest(struct shash_desc *desc, const u8 *data, + unsigned len, u8 *out) { - memzero_explicit(desc, - sizeof(*desc) + crypto_shash_descsize(desc->tfm)); + return crypto_shash_alg(desc->tfm)->digest(desc, data, len, out); } #endif /* _CRYPTO_HASH_H */ diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 2d85c803..3973047b 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -5,9 +5,6 @@ #include <crypto/hash.h> int crypto_register_shash(struct shash_alg *alg); -int crypto_unregister_shash(struct shash_alg *alg); -int crypto_register_shashes(struct shash_alg *algs, int count); -int crypto_unregister_shashes(struct shash_alg *algs, int count); static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm) { diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index 894df59b..9fcfbfeb 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -5,37 +5,9 @@ #ifndef _CRYPTO_POLY1305_H #define _CRYPTO_POLY1305_H -#include <linux/types.h> -#include <linux/crypto.h> +#include <sodium/crypto_onetimeauth_poly1305.h> -#define POLY1305_BLOCK_SIZE 16 -#define POLY1305_KEY_SIZE 32 -#define POLY1305_DIGEST_SIZE 16 - -struct poly1305_desc_ctx { - /* key */ - u32 r[5]; - /* finalize key */ - u32 s[4]; - /* accumulator */ - u32 h[5]; - /* partial buffer */ - u8 buf[POLY1305_BLOCK_SIZE]; - /* bytes used in partial buffer */ - unsigned int buflen; - /* r key has been set */ - bool rset; - /* s key has been set */ - bool sset; -}; - -int crypto_poly1305_init(struct 
shash_desc *desc); -int crypto_poly1305_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen); -unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen); -int crypto_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen); -int crypto_poly1305_final(struct shash_desc *desc, u8 *dst); +#define POLY1305_KEY_SIZE crypto_onetimeauth_poly1305_KEYBYTES +#define POLY1305_DIGEST_SIZE crypto_onetimeauth_poly1305_BYTES #endif diff --git a/include/crypto/sha.h b/include/crypto/sha.h deleted file mode 100644 index c94d3eb1..00000000 --- a/include/crypto/sha.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Common values for SHA algorithms - */ - -#ifndef _CRYPTO_SHA_H -#define _CRYPTO_SHA_H - -#include <linux/types.h> - -#define SHA1_DIGEST_SIZE 20 -#define SHA1_BLOCK_SIZE 64 - -#define SHA224_DIGEST_SIZE 28 -#define SHA224_BLOCK_SIZE 64 - -#define SHA256_DIGEST_SIZE 32 -#define SHA256_BLOCK_SIZE 64 - -#define SHA384_DIGEST_SIZE 48 -#define SHA384_BLOCK_SIZE 128 - -#define SHA512_DIGEST_SIZE 64 -#define SHA512_BLOCK_SIZE 128 - -#define SHA1_H0 0x67452301UL -#define SHA1_H1 0xefcdab89UL -#define SHA1_H2 0x98badcfeUL -#define SHA1_H3 0x10325476UL -#define SHA1_H4 0xc3d2e1f0UL - -#define SHA224_H0 0xc1059ed8UL -#define SHA224_H1 0x367cd507UL -#define SHA224_H2 0x3070dd17UL -#define SHA224_H3 0xf70e5939UL -#define SHA224_H4 0xffc00b31UL -#define SHA224_H5 0x68581511UL -#define SHA224_H6 0x64f98fa7UL -#define SHA224_H7 0xbefa4fa4UL - -#define SHA256_H0 0x6a09e667UL -#define SHA256_H1 0xbb67ae85UL -#define SHA256_H2 0x3c6ef372UL -#define SHA256_H3 0xa54ff53aUL -#define SHA256_H4 0x510e527fUL -#define SHA256_H5 0x9b05688cUL -#define SHA256_H6 0x1f83d9abUL -#define SHA256_H7 0x5be0cd19UL - -#define SHA384_H0 0xcbbb9d5dc1059ed8ULL -#define SHA384_H1 0x629a292a367cd507ULL -#define SHA384_H2 0x9159015a3070dd17ULL -#define SHA384_H3 0x152fecd8f70e5939ULL -#define SHA384_H4 0x67332667ffc00b31ULL -#define 
SHA384_H5 0x8eb44a8768581511ULL -#define SHA384_H6 0xdb0c2e0d64f98fa7ULL -#define SHA384_H7 0x47b5481dbefa4fa4ULL - -#define SHA512_H0 0x6a09e667f3bcc908ULL -#define SHA512_H1 0xbb67ae8584caa73bULL -#define SHA512_H2 0x3c6ef372fe94f82bULL -#define SHA512_H3 0xa54ff53a5f1d36f1ULL -#define SHA512_H4 0x510e527fade682d1ULL -#define SHA512_H5 0x9b05688c2b3e6c1fULL -#define SHA512_H6 0x1f83d9abfb41bd6bULL -#define SHA512_H7 0x5be0cd19137e2179ULL - -extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE]; - -extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE]; - -extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE]; - -struct sha1_state { - u32 state[SHA1_DIGEST_SIZE / 4]; - u64 count; - u8 buffer[SHA1_BLOCK_SIZE]; -}; - -struct sha256_state { - u32 state[SHA256_DIGEST_SIZE / 4]; - u64 count; - u8 buf[SHA256_BLOCK_SIZE]; -}; - -struct sha512_state { - u64 state[SHA512_DIGEST_SIZE / 8]; - u64 count[2]; - u8 buf[SHA512_BLOCK_SIZE]; -}; - -struct shash_desc; - -extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); -#endif diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h deleted file mode 100644 index 01b002de..00000000 --- a/include/crypto/sha1_base.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * sha1_base.h - core logic for SHA-1 implementations - * - * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the 
terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <crypto/internal/hash.h> -#include <crypto/sha.h> -#include <linux/byteorder.h> -#include <linux/crypto.h> -#include <linux/module.h> - -#include <asm/unaligned.h> - -typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks); - -static inline int sha1_base_init(struct shash_desc *desc) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA1_H0; - sctx->state[1] = SHA1_H1; - sctx->state[2] = SHA1_H2; - sctx->state[3] = SHA1_H3; - sctx->state[4] = SHA1_H4; - sctx->count = 0; - - return 0; -} - -static inline int sha1_base_do_update(struct shash_desc *desc, - const u8 *data, - unsigned int len, - sha1_block_fn *block_fn) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; - - sctx->count += len; - - if (unlikely((partial + len) >= SHA1_BLOCK_SIZE)) { - int blocks; - - if (partial) { - int p = SHA1_BLOCK_SIZE - partial; - - memcpy(sctx->buffer + partial, data, p); - data += p; - len -= p; - - block_fn(sctx, sctx->buffer, 1); - } - - blocks = len / SHA1_BLOCK_SIZE; - len %= SHA1_BLOCK_SIZE; - - if (blocks) { - block_fn(sctx, data, blocks); - data += blocks * SHA1_BLOCK_SIZE; - } - partial = 0; - } - if (len) - memcpy(sctx->buffer + partial, data, len); - - return 0; -} - -static inline int sha1_base_do_finalize(struct shash_desc *desc, - sha1_block_fn *block_fn) -{ - const int bit_offset = SHA1_BLOCK_SIZE - sizeof(__be64); - struct sha1_state *sctx = shash_desc_ctx(desc); - __be64 *bits = (__be64 *)(sctx->buffer + bit_offset); - unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; - - sctx->buffer[partial++] = 0x80; - if (partial > bit_offset) { - memset(sctx->buffer + partial, 0x0, SHA1_BLOCK_SIZE - partial); - partial = 0; - - block_fn(sctx, sctx->buffer, 1); - } - - memset(sctx->buffer + partial, 0x0, bit_offset - partial); - *bits = 
cpu_to_be64(sctx->count << 3); - block_fn(sctx, sctx->buffer, 1); - - return 0; -} - -static inline int sha1_base_finish(struct shash_desc *desc, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - __be32 *digest = (__be32 *)out; - int i; - - for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], digest++); - - *sctx = (struct sha1_state){}; - return 0; -} diff --git a/include/keys/user-type.h b/include/keys/user-type.h new file mode 100644 index 00000000..a7a2ee45 --- /dev/null +++ b/include/keys/user-type.h @@ -0,0 +1,6 @@ +#ifndef _KEYS_USER_TYPE_H +#define _KEYS_USER_TYPE_H + +#include <linux/key.h> + +#endif /* _KEYS_USER_TYPE_H */ diff --git a/include/linux/bcache.h b/include/linux/bcache.h index f09a44a6..4179f8dd 100644 --- a/include/linux/bcache.h +++ b/include/linux/bcache.h @@ -102,9 +102,17 @@ struct bch_val { __u64 __nothing[0]; }; -struct bkey { - __u64 _data[0]; +struct bversion { +#if defined(__LITTLE_ENDIAN) + __u64 lo; + __u32 hi; +#elif defined(__BIG_ENDIAN) + __u32 hi; + __u64 lo; +#endif +} __attribute__((packed, aligned(4))); +struct bkey { /* Size of combined key and value, in u64s */ __u8 u64s; @@ -125,13 +133,13 @@ struct bkey { #if defined(__LITTLE_ENDIAN) __u8 pad[1]; - __u32 version; + struct bversion version; __u32 size; /* extent size, in sectors */ struct bpos p; #elif defined(__BIG_ENDIAN) struct bpos p; __u32 size; /* extent size, in sectors */ - __u32 version; + struct bversion version; __u8 pad[1]; #endif @@ -184,7 +192,8 @@ enum bch_bkey_fields { BKEY_FIELD_OFFSET, BKEY_FIELD_SNAPSHOT, BKEY_FIELD_SIZE, - BKEY_FIELD_VERSION, + BKEY_FIELD_VERSION_HI, + BKEY_FIELD_VERSION_LO, BKEY_NR_FIELDS, }; @@ -200,14 +209,25 @@ enum bch_bkey_fields { bkey_format_field(OFFSET, p.offset), \ bkey_format_field(SNAPSHOT, p.snapshot), \ bkey_format_field(SIZE, size), \ - bkey_format_field(VERSION, version), \ + bkey_format_field(VERSION_HI, version.hi), \ + bkey_format_field(VERSION_LO, version.lo), \ 
}, \ }) /* bkey with inline value */ struct bkey_i { - struct bkey k; - struct bch_val v; + __u64 _data[0]; + + union { + struct { + /* Size of combined key and value, in u64s */ + __u8 u64s; + }; + struct { + struct bkey k; + struct bch_val v; + }; + }; }; #ifndef __cplusplus @@ -358,20 +378,47 @@ BKEY_VAL_TYPE(cookie, KEY_TYPE_COOKIE); * is neither checksummed nor compressed. */ +/* 128 bits, sufficient for cryptographic MACs: */ +struct bch_csum { + __le64 lo; + __le64 hi; +} __attribute__((packed, aligned(8))); + +#define BCH_CSUM_NONE 0U +#define BCH_CSUM_CRC32C 1U +#define BCH_CSUM_CRC64 2U +#define BCH_CSUM_CHACHA20_POLY1305_80 3U +#define BCH_CSUM_CHACHA20_POLY1305_128 4U +#define BCH_CSUM_NR 5U + +static inline _Bool bch_csum_type_is_encryption(unsigned type) +{ + switch (type) { + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: + return true; + default: + return false; + } +} + enum bch_extent_entry_type { - BCH_EXTENT_ENTRY_crc32 = 0, - BCH_EXTENT_ENTRY_ptr = 1, + BCH_EXTENT_ENTRY_ptr = 0, + BCH_EXTENT_ENTRY_crc32 = 1, BCH_EXTENT_ENTRY_crc64 = 2, + BCH_EXTENT_ENTRY_crc128 = 3, }; -#define BCH_EXTENT_ENTRY_MAX 3 +#define BCH_EXTENT_ENTRY_MAX 4 +/* Compressed/uncompressed size are stored biased by 1: */ struct bch_extent_crc32 { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:1, + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, offset:7, - compressed_size:8, - uncompressed_size:8, + _unused:1, csum_type:4, compression_type:4; __u32 csum; @@ -379,45 +426,80 @@ struct bch_extent_crc32 { __u32 csum; __u32 compression_type:4, csum_type:4, - uncompressed_size:8, - compressed_size:8, + _unused:1, offset:7, - type:1; + _uncompressed_size:7, + _compressed_size:7, + type:2; #endif } __attribute__((packed, aligned(8))); -#define CRC32_EXTENT_SIZE_MAX (1U << 7) - -/* 64k */ -#define BCH_COMPRESSED_EXTENT_MAX 128U +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 struct bch_extent_crc64 { #if 
defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:3, - offset:17, - compressed_size:18, - uncompressed_size:18, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __attribute__((packed, aligned(8))); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, csum_type:4, compression_type:4; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 compression_type:4, csum_type:4, - uncompressed_size:18, - compressed_size:18, - offset:17, + nonce:14, + offset:13, + _uncompressed_size:13, + _compressed_size:13, type:3; #endif - __u64 csum; + struct bch_csum csum; } __attribute__((packed, aligned(8))); -#define CRC64_EXTENT_SIZE_MAX (1U << 17) +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * Max size of an extent that may require bouncing to read or write + * (checksummed, compressed): 64k + */ +#define BCH_ENCODED_EXTENT_MAX 128U /* * @reservation - pointer hasn't been written to, just reserved */ struct bch_extent_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:2, + __u64 type:1, + cached:1, erasure_coded:1, reservation:1, offset:44, /* 8 petabytes */ @@ -429,10 +511,25 @@ struct bch_extent_ptr { offset:44, reservation:1, erasure_coded:1, - type:2; + cached:1, + type:1; #endif } __attribute__((packed, aligned(8))); +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + unused:23, + replicas:4, + generation:32; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 generation:32, + replicas:4, + unused:23, + type:5; +#endif +}; + union bch_extent_entry { #if 
defined(__LITTLE_ENDIAN) || __BITS_PER_LONG == 64 unsigned long type; @@ -446,6 +543,7 @@ union bch_extent_entry { #endif struct bch_extent_crc32 crc32; struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; struct bch_extent_ptr ptr; }; @@ -473,9 +571,18 @@ struct bch_extent { } __attribute__((packed, aligned(8))); BKEY_VAL_TYPE(extent, BCH_EXTENT); +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(reservation, BCH_RESERVATION); + /* Maximum size (in u64s) a single pointer could be: */ #define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc64) + \ + ((sizeof(struct bch_extent_crc128) + \ sizeof(struct bch_extent_ptr)) / sizeof(u64)) /* Maximum possible size of an entire extent value: */ @@ -506,28 +613,26 @@ enum bch_inode_types { struct bch_inode { struct bch_val v; - __le16 i_mode; - __le16 pad; - __le32 i_flags; - - /* Nanoseconds */ - __le64 i_atime; - __le64 i_ctime; - __le64 i_mtime; - - __le64 i_size; - __le64 i_sectors; - - __le32 i_uid; - __le32 i_gid; - __le32 i_nlink; - - __le32 i_dev; - __le64 i_hash_seed; + __le32 i_flags; + __le16 i_mode; + __u8 fields[0]; } __attribute__((packed)); BKEY_VAL_TYPE(inode, BCH_INODE_FS); +#define BCH_INODE_FIELDS() \ + BCH_INODE_FIELD(i_atime, 64) \ + BCH_INODE_FIELD(i_ctime, 64) \ + BCH_INODE_FIELD(i_mtime, 64) \ + BCH_INODE_FIELD(i_otime, 64) \ + BCH_INODE_FIELD(i_size, 64) \ + BCH_INODE_FIELD(i_sectors, 64) \ + BCH_INODE_FIELD(i_uid, 32) \ + BCH_INODE_FIELD(i_gid, 32) \ + BCH_INODE_FIELD(i_nlink, 32) \ + BCH_INODE_FIELD(i_generation, 32) \ + BCH_INODE_FIELD(i_dev, 32) + enum { /* * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL @@ -544,9 +649,9 @@ enum { /* not implemented yet: */ __BCH_INODE_HAS_XATTRS = 7, /* has xattrs in xattr btree */ -}; -LE32_BITMASK(INODE_STR_HASH_TYPE, struct bch_inode, i_flags, 28, 32); + /* bits 20+ reserved for packed fields below: */ +}; 
#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) #define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) @@ -557,6 +662,9 @@ LE32_BITMASK(INODE_STR_HASH_TYPE, struct bch_inode, i_flags, 28, 32); #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) #define BCH_INODE_HAS_XATTRS (1 << __BCH_INODE_HAS_XATTRS) +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, i_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, i_flags, 24, 32); + struct bch_inode_blockdev { struct bch_val v; @@ -574,6 +682,7 @@ BKEY_VAL_TYPE(inode_blockdev, BCH_INODE_BLOCKDEV); /* Thin provisioned volume, or cache for another block device? */ LE64_BITMASK(CACHED_DEV, struct bch_inode_blockdev, i_flags, 0, 1) + /* Dirents */ /* @@ -639,6 +748,7 @@ BKEY_VAL_TYPE(xattr, BCH_XATTR); * Version 4: Backing device with data offset * Version 5: All the incompat changes * Version 6: Cache device UUIDs all in superblock, another incompat bset change + * Version 7: Encryption (expanded checksum fields), other random things */ #define BCACHE_SB_VERSION_CDEV_V0 0 #define BCACHE_SB_VERSION_BDEV 1 @@ -646,16 +756,15 @@ BKEY_VAL_TYPE(xattr, BCH_XATTR); #define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 #define BCACHE_SB_VERSION_CDEV_V2 5 #define BCACHE_SB_VERSION_CDEV_V3 6 -#define BCACHE_SB_VERSION_CDEV 6 -#define BCACHE_SB_MAX_VERSION 6 +#define BCACHE_SB_VERSION_CDEV_V4 7 +#define BCACHE_SB_VERSION_CDEV 7 +#define BCACHE_SB_MAX_VERSION 7 -#define SB_SECTOR 8 -#define SB_LABEL_SIZE 32 -#define MAX_CACHES_PER_SET 64 - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ +#define BCH_SB_SECTOR 8 +#define BCH_SB_LABEL_SIZE 32 +#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ -struct cache_member { +struct bch_member { uuid_le uuid; __le64 nbuckets; /* device size */ __le16 first_bucket; /* index of first bucket used */ @@ -663,164 +772,257 @@ struct cache_member { __le32 pad; __le64 last_mount; /* time_t */ - __le64 f1; - __le64 f2; + __le64 flags[2]; }; -LE64_BITMASK(CACHE_STATE, struct cache_member, f1, 
0, 4) -#define CACHE_ACTIVE 0U -#define CACHE_RO 1U -#define CACHE_FAILED 2U -#define CACHE_SPARE 3U -#define CACHE_STATE_NR 4U +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) +LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8) +LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9) +LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10) +LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15); -LE64_BITMASK(CACHE_TIER, struct cache_member, f1, 4, 8) -#define CACHE_TIERS 4U +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif -LE64_BITMASK(CACHE_REPLICATION_SET, struct cache_member, f1, 8, 16) +enum bch_member_state { + BCH_MEMBER_STATE_ACTIVE = 0, + BCH_MEMBER_STATE_RO = 1, + BCH_MEMBER_STATE_FAILED = 2, + BCH_MEMBER_STATE_SPARE = 3, + BCH_MEMBER_STATE_NR = 4, +}; -LE64_BITMASK(CACHE_HAS_METADATA, struct cache_member, f1, 24, 25) -LE64_BITMASK(CACHE_HAS_DATA, struct cache_member, f1, 25, 26) +#define BCH_TIER_MAX 4U -LE64_BITMASK(CACHE_REPLACEMENT, struct cache_member, f1, 26, 30) -#define CACHE_REPLACEMENT_LRU 0U -#define CACHE_REPLACEMENT_FIFO 1U -#define CACHE_REPLACEMENT_RANDOM 2U -#define CACHE_REPLACEMENT_NR 3U +enum cache_replacement { + CACHE_REPLACEMENT_LRU = 0, + CACHE_REPLACEMENT_FIFO = 1, + CACHE_REPLACEMENT_RANDOM = 2, + CACHE_REPLACEMENT_NR = 3, +}; -LE64_BITMASK(CACHE_DISCARD, struct cache_member, f1, 30, 31); +struct bch_sb_layout { + uuid_le magic; /* bcache superblock UUID */ + __u8 layout_type; + __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ + __u8 nr_superblocks; + __u8 pad[5]; + __u64 sb_offset[61]; +} __attribute__((packed)); -LE64_BITMASK(CACHE_NR_READ_ERRORS, struct cache_member, f2, 0, 20); -LE64_BITMASK(CACHE_NR_WRITE_ERRORS, struct cache_member, f2, 20, 40); 
+#define BCH_SB_LAYOUT_SECTOR 7 -struct cache_sb { - __le64 csum; - __le64 offset; /* sector where this sb was written */ - __le64 version; /* of on disk format */ +struct bch_sb_field { + __u64 _data[0]; + __le32 u64s; + __le32 type; +}; - uuid_le magic; /* bcache superblock UUID */ +enum bch_sb_field_types { + BCH_SB_FIELD_journal = 0, + BCH_SB_FIELD_members = 1, + BCH_SB_FIELD_crypt = 2, + BCH_SB_FIELD_NR = 3, +}; - /* Identifies this disk within the cache set: */ - uuid_le disk_uuid; +struct bch_sb_field_journal { + struct bch_sb_field field; + __le64 buckets[0]; +}; - /* - * Internal cache set UUID - xored with various magic numbers and thus - * must never change: - */ - union { - uuid_le set_uuid; - __le64 set_magic; - }; +struct bch_sb_field_members { + struct bch_sb_field field; + struct bch_member members[0]; +}; + +/* Crypto: */ - __u8 label[SB_LABEL_SIZE]; +struct nonce { + __le32 d[4]; +}; + +struct bch_key { + __le64 key[4]; +}; + +#define BCH_KEY_MAGIC \ + (((u64) 'b' << 0)|((u64) 'c' << 8)| \ + ((u64) 'h' << 16)|((u64) '*' << 24)| \ + ((u64) '*' << 32)|((u64) 'k' << 40)| \ + ((u64) 'e' << 48)|((u64) 'y' << 56)) + +struct bch_encrypted_key { + __le64 magic; + struct bch_key key; +}; + +/* + * If this field is present in the superblock, it stores an encryption key which + * is used encrypt all other data/metadata. The key will normally be encrypted + * with the key userspace provides, but if encryption has been turned off we'll + * just store the master key unencrypted in the superblock so we can access the + * previously encrypted data. 
+ */ +struct bch_sb_field_crypt { + struct bch_sb_field field; __le64 flags; + __le64 kdf_flags; + struct bch_encrypted_key key; +}; - /* Incremented each time superblock is written: */ - __le64 seq; +LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); - /* - * User visible UUID for identifying the cache set the user is allowed - * to change: - */ - uuid_le user_uuid; +enum bch_kdf_types { + BCH_KDF_SCRYPT = 0, + BCH_KDF_NR = 1, +}; - __le64 flags2; - __le64 pad1[5]; +/* stored as base 2 log of scrypt params: */ +LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); +LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); +LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); - /* Number of cache_member entries: */ - __u8 nr_in_set; +/* + * @offset - sector where this sb was written + * @version - on disk format version + * @magic - identifies as a bcache superblock (BCACHE_MAGIC) + * @seq - incremented each time superblock is written + * @uuid - used for generating various magic numbers and identifying + * member devices, never changes + * @user_uuid - user visible UUID, may be changed + * @label - filesystem label + * @seq - identifies most recent superblock, incremented each time + * superblock is written + * @features - enabled incompatible features + */ +struct bch_sb { + struct bch_csum csum; + __le64 version; + uuid_le magic; + uuid_le uuid; + uuid_le user_uuid; + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 offset; + __le64 seq; - /* - * Index of this device - for PTR_DEV(), and also this device's - * slot in the cache_member array: - */ - __u8 nr_this_dev; - __le16 pad2[3]; + __le16 block_size; + __u8 dev_idx; + __u8 nr_devices; + __le32 u64s; - __le16 block_size; /* sectors */ - __le16 pad3[6]; + __le64 time_base_lo; + __le32 time_base_hi; + __le32 time_precision; + + __le64 flags[8]; + __le64 features[2]; + __le64 compat[2]; - __le16 u64s; /* size of variable length 
portion */ + struct bch_sb_layout layout; union { - struct cache_member members[0]; - /* - * Journal buckets also in the variable length portion, after - * the member info: - */ - __le64 _data[0]; + struct bch_sb_field start[0]; + __le64 _data[0]; }; -}; +} __attribute__((packed, aligned(8))); -/* XXX: rename CACHE_SET -> BCH_FS or something? */ +/* + * Flags: + * BCH_SB_INITALIZED - set on first mount + * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect + * behaviour of mount/recovery path: + * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits + * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 + * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides + * DATA/META_CSUM_TYPE. Also indicates encryption + * algorithm in use, if/when we get more than one + */ -LE64_BITMASK(CACHE_SET_SYNC, struct cache_sb, flags, 0, 1); +LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); +LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); +LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); +LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); -LE64_BITMASK(CACHE_SET_ERROR_ACTION, struct cache_sb, flags, 1, 4); -#define BCH_ON_ERROR_CONTINUE 0U -#define BCH_ON_ERROR_RO 1U -#define BCH_ON_ERROR_PANIC 2U -#define BCH_NR_ERROR_ACTIONS 3U +LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); -LE64_BITMASK(CACHE_SET_META_REPLICAS_WANT,struct cache_sb, flags, 4, 8); -LE64_BITMASK(CACHE_SET_DATA_REPLICAS_WANT,struct cache_sb, flags, 8, 12); +LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); +LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); -#define BCH_REPLICAS_MAX 4U +LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); +LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); -LE64_BITMASK(CACHE_SB_CSUM_TYPE, struct cache_sb, flags, 12, 16); +LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); 
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); -LE64_BITMASK(CACHE_SET_META_PREFERRED_CSUM_TYPE,struct cache_sb, flags, 16, 20); -#define BCH_CSUM_NONE 0U -#define BCH_CSUM_CRC32C 1U -#define BCH_CSUM_CRC64 2U -#define BCH_CSUM_NR 3U +LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64); -LE64_BITMASK(CACHE_SET_BTREE_NODE_SIZE, struct cache_sb, flags, 20, 36); +LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); -LE64_BITMASK(CACHE_SET_META_REPLICAS_HAVE,struct cache_sb, flags, 36, 40); -LE64_BITMASK(CACHE_SET_DATA_REPLICAS_HAVE,struct cache_sb, flags, 40, 44); +LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); +LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); +LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE, struct bch_sb, flags[1], 14, 20); -LE64_BITMASK(CACHE_SET_STR_HASH_TYPE,struct cache_sb, flags, 44, 48); -enum bch_str_hash_type { - BCH_STR_HASH_CRC32C = 0, - BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH = 2, - BCH_STR_HASH_SHA1 = 3, +/* Features: */ +enum bch_sb_features { + BCH_FEATURE_LZ4 = 0, + BCH_FEATURE_GZIP = 1, }; -#define BCH_STR_HASH_NR 4 +/* options: */ -LE64_BITMASK(CACHE_SET_DATA_PREFERRED_CSUM_TYPE, struct cache_sb, flags, 48, 52); +#define BCH_REPLICAS_MAX 4U -LE64_BITMASK(CACHE_SET_COMPRESSION_TYPE, struct cache_sb, flags, 52, 56); -enum { - BCH_COMPRESSION_NONE = 0, - BCH_COMPRESSION_LZ4 = 1, - BCH_COMPRESSION_GZIP = 2, +#if 0 +#define BCH_ERROR_ACTIONS() \ + x(BCH_ON_ERROR_CONTINUE, 0, "continue") \ + x(BCH_ON_ERROR_RO, 1, "remount-ro") \ + x(BCH_ON_ERROR_PANIC, 2, "panic") \ + x(BCH_NR_ERROR_ACTIONS, 3, NULL) + +enum bch_error_actions { +#define x(_opt, _nr, _str) _opt = _nr, + BCH_ERROR_ACTIONS() +#undef x }; +#endif -#define 
BCH_COMPRESSION_NR 3U - -/* Limit inode numbers to 32 bits: */ -LE64_BITMASK(CACHE_INODE_32BIT, struct cache_sb, flags, 56, 57); - -LE64_BITMASK(CACHE_SET_GC_RESERVE, struct cache_sb, flags, 57, 63); - -LE64_BITMASK(CACHE_SET_ROOT_RESERVE, struct cache_sb, flags2, 0, 6); +enum bch_error_actions { + BCH_ON_ERROR_CONTINUE = 0, + BCH_ON_ERROR_RO = 1, + BCH_ON_ERROR_PANIC = 2, + BCH_NR_ERROR_ACTIONS = 3, +}; -/* - * Did we shut down cleanly? Just a hint, doesn't affect behaviour of - * mount/recovery path: - */ -LE64_BITMASK(CACHE_SET_CLEAN, struct cache_sb, flags2, 6, 7); +enum bch_csum_opts { + BCH_CSUM_OPT_NONE = 0, + BCH_CSUM_OPT_CRC32C = 1, + BCH_CSUM_OPT_CRC64 = 2, + BCH_CSUM_OPT_NR = 3, +}; -LE64_BITMASK(CACHE_SET_JOURNAL_ENTRY_SIZE, struct cache_sb, flags2, 7, 15); +enum bch_str_hash_opts { + BCH_STR_HASH_CRC32C = 0, + BCH_STR_HASH_CRC64 = 1, + BCH_STR_HASH_SIPHASH = 2, + BCH_STR_HASH_NR = 3, +}; -/* options: */ +enum bch_compression_opts { + BCH_COMPRESSION_NONE = 0, + BCH_COMPRESSION_LZ4 = 1, + BCH_COMPRESSION_GZIP = 2, + BCH_COMPRESSION_NR = 3, +}; /** - * CACHE_SET_OPT(name, choices, min, max, sb_option, sysfs_writeable) + * BCH_OPT(name, choices, min, max, sb_option, sysfs_writeable) * * @name - name of mount option, sysfs attribute, and struct cache_set_opts * member @@ -838,56 +1040,60 @@ LE64_BITMASK(CACHE_SET_JOURNAL_ENTRY_SIZE, struct cache_sb, flags2, 7, 15); * @sysfs_writeable - if true, option will be modifiable at runtime via sysfs */ -#define CACHE_SET_SB_OPTS() \ - CACHE_SET_OPT(errors, \ - bch_error_actions, \ - 0, BCH_NR_ERROR_ACTIONS, \ - CACHE_SET_ERROR_ACTION, \ - true) \ - CACHE_SET_OPT(metadata_replicas, \ - bch_uint_opt, \ - 0, BCH_REPLICAS_MAX, \ - CACHE_SET_META_REPLICAS_WANT, \ - false) \ - CACHE_SET_OPT(data_replicas, \ - bch_uint_opt, \ - 0, BCH_REPLICAS_MAX, \ - CACHE_SET_DATA_REPLICAS_WANT, \ - false) \ - CACHE_SET_OPT(metadata_checksum, \ - bch_csum_types, \ - 0, BCH_CSUM_NR, \ - CACHE_SET_META_PREFERRED_CSUM_TYPE, \ - true) \ - 
CACHE_SET_OPT(data_checksum, \ - bch_csum_types, \ - 0, BCH_CSUM_NR, \ - CACHE_SET_DATA_PREFERRED_CSUM_TYPE, \ - true) \ - CACHE_SET_OPT(compression, \ - bch_compression_types, \ - 0, BCH_COMPRESSION_NR, \ - CACHE_SET_COMPRESSION_TYPE, \ - true) \ - CACHE_SET_OPT(str_hash, \ - bch_str_hash_types, \ - 0, BCH_STR_HASH_NR, \ - CACHE_SET_STR_HASH_TYPE, \ - true) \ - CACHE_SET_OPT(inodes_32bit, \ - bch_bool_opt, 0, 2, \ - CACHE_INODE_32BIT, \ - true) \ - CACHE_SET_OPT(gc_reserve_percent, \ - bch_uint_opt, \ - 5, 21, \ - CACHE_SET_GC_RESERVE, \ - false) \ - CACHE_SET_OPT(root_reserve_percent, \ - bch_uint_opt, \ - 0, 21, \ - CACHE_SET_ROOT_RESERVE, \ - false) +#define BCH_SB_OPTS() \ + BCH_OPT(errors, \ + bch_error_actions, \ + 0, BCH_NR_ERROR_ACTIONS, \ + BCH_SB_ERROR_ACTION, \ + true) \ + BCH_OPT(metadata_replicas, \ + bch_uint_opt, \ + 0, BCH_REPLICAS_MAX, \ + BCH_SB_META_REPLICAS_WANT, \ + false) \ + BCH_OPT(data_replicas, \ + bch_uint_opt, \ + 0, BCH_REPLICAS_MAX, \ + BCH_SB_DATA_REPLICAS_WANT, \ + false) \ + BCH_OPT(metadata_checksum, \ + bch_csum_types, \ + 0, BCH_CSUM_OPT_NR, \ + BCH_SB_META_CSUM_TYPE, \ + true) \ + BCH_OPT(data_checksum, \ + bch_csum_types, \ + 0, BCH_CSUM_OPT_NR, \ + BCH_SB_DATA_CSUM_TYPE, \ + true) \ + BCH_OPT(compression, \ + bch_compression_types, \ + 0, BCH_COMPRESSION_NR, \ + BCH_SB_COMPRESSION_TYPE, \ + true) \ + BCH_OPT(str_hash, \ + bch_str_hash_types, \ + 0, BCH_STR_HASH_NR, \ + BCH_SB_STR_HASH_TYPE, \ + true) \ + BCH_OPT(inodes_32bit, \ + bch_bool_opt, 0, 2, \ + BCH_SB_INODE_32BIT, \ + true) \ + BCH_OPT(gc_reserve_percent, \ + bch_uint_opt, \ + 5, 21, \ + BCH_SB_GC_RESERVE, \ + false) \ + BCH_OPT(root_reserve_percent, \ + bch_uint_opt, \ + 0, 100, \ + BCH_SB_ROOT_RESERVE, \ + false) \ + BCH_OPT(wide_macs, \ + bch_bool_opt, 0, 2, \ + BCH_SB_128_BIT_MACS, \ + true) /* backing device specific stuff: */ @@ -908,7 +1114,7 @@ struct backingdev_sb { uuid_le set_uuid; __le64 set_magic; }; - __u8 label[SB_LABEL_SIZE]; + __u8 
label[BCH_SB_LABEL_SIZE]; __le64 flags; @@ -947,15 +1153,7 @@ LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63); #define BDEV_STATE_DIRTY 2U #define BDEV_STATE_STALE 3U -static inline unsigned bch_journal_buckets_offset(struct cache_sb *sb) -{ - return sb->nr_in_set * (sizeof(struct cache_member) / sizeof(__u64)); -} - -static inline unsigned bch_nr_journal_buckets(struct cache_sb *sb) -{ - return __le16_to_cpu(sb->u64s) - bch_journal_buckets_offset(sb); -} +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ static inline _Bool __SB_IS_BDEV(__u64 version) { @@ -963,7 +1161,7 @@ static inline _Bool __SB_IS_BDEV(__u64 version) || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; } -static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +static inline _Bool SB_IS_BDEV(const struct bch_sb *sb) { return __SB_IS_BDEV(sb->version); } @@ -981,29 +1179,33 @@ static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) #define BCACHE_STATFS_MAGIC 0xca451a4e -#define BCACHE_SB_MAGIC 0xca451a4ef67385c6ULL -#define BCACHE_SB_MAGIC2 0x816dba487ff56582ULL -#define JSET_MAGIC 0x245235c1a3625032ULL -#define PSET_MAGIC 0x6750e15f87337f91ULL -#define BSET_MAGIC 0x90135c78b99e07f5ULL +#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) +#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL) +#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) -static inline __u64 jset_magic(struct cache_sb *sb) +static inline __le64 __bch_sb_magic(struct bch_sb *sb) { - return __le64_to_cpu(sb->set_magic) ^ JSET_MAGIC; + __le64 ret; + memcpy(&ret, &sb->uuid, sizeof(ret)); + return ret; } -static inline __u64 pset_magic(struct cache_sb *sb) +static inline __u64 __jset_magic(struct bch_sb *sb) { - return __le64_to_cpu(sb->set_magic) ^ PSET_MAGIC; + return __le64_to_cpu(__bch_sb_magic(sb) ^ JSET_MAGIC); } -static inline __u64 bset_magic(struct cache_sb *sb) +static inline __u64 __pset_magic(struct bch_sb *sb) { - return __le64_to_cpu(sb->set_magic) ^ BSET_MAGIC; + return 
__le64_to_cpu(__bch_sb_magic(sb) ^ PSET_MAGIC); } -/* Journal */ +static inline __u64 __bset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch_sb_magic(sb) ^ BSET_MAGIC); +} +/* Journal */ #define BCACHE_JSET_VERSION_UUIDv1 1 #define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ @@ -1054,24 +1256,29 @@ enum { * version is for on disk format changes. */ struct jset { - __le64 csum; + struct bch_csum csum; + __le64 magic; + __le64 seq; __le32 version; __le32 flags; - /* Sequence number of oldest dirty journal entry */ - __le64 seq; - __le64 last_seq; + __le32 u64s; /* size of d[] in u64s */ + + __u8 encrypted_start[0]; __le16 read_clock; __le16 write_clock; - __le32 u64s; /* size of d[] in u64s */ + + /* Sequence number of oldest dirty journal entry */ + __le64 last_seq; + union { struct jset_entry start[0]; __u64 _data[0]; }; -}; +} __attribute__((packed)); LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); @@ -1081,10 +1288,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); /* Bucket prios/gens */ struct prio_set { - __le64 csum; + struct bch_csum csum; + __le64 magic; - __le32 version; - __le32 flags; + __le32 nonce[3]; + __le16 version; + __le16 flags; + + __u8 encrypted_start[0]; __le64 next_bucket; @@ -1093,7 +1304,7 @@ struct prio_set { __le16 write_prio; __u8 gen; } __attribute__((packed)) data[]; -}; +} __attribute__((packed)); LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4); @@ -1155,28 +1366,49 @@ struct bset { LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); -/* Only used in first bset */ -LE32_BITMASK(BSET_BTREE_LEVEL, struct bset, flags, 4, 8); - -LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 8, 9); +LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, - struct bset, flags, 9, 10); + struct bset, flags, 5, 6); struct btree_node { - __le64 csum; + struct bch_csum csum; __le64 magic; + /* this flags 
field is encrypted, unlike bset->flags: */ + __le64 flags; + /* Closed interval: */ struct bpos min_key; struct bpos max_key; + struct bch_extent_ptr ptr; struct bkey_format format; + union { struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; } __attribute__((packed)); +LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); + struct btree_node_entry { - __le64 csum; + struct bch_csum csum; + + union { struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; } __attribute__((packed)); /* OBSOLETE */ @@ -1237,7 +1469,7 @@ struct jset_v0 { __u16 btree_level; __u16 pad[3]; - __u64 prio_bucket[MAX_CACHES_PER_SET]; + __u64 prio_bucket[64]; union { struct bkey start[0]; diff --git a/include/linux/crypto.h b/include/linux/crypto.h index cb9ad24f..0dbeaaed 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -24,299 +24,81 @@ #include <linux/slab.h> #include <linux/string.h> -/* - * Autoloaded crypto modules should only use a prefixed name to avoid allowing - * arbitrary modules to be loaded. Loading from userspace may still need the - * unprefixed names, so retains those aliases as well. - * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3 - * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro - * expands twice on the same line. Instead, use a separate base name for the - * alias. - */ -#define MODULE_ALIAS_CRYPTO(name) \ - __MODULE_INFO(alias, alias_userspace, name); \ - __MODULE_INFO(alias, alias_crypto, "crypto-" name) - -/* - * Algorithm masks and types. 
- */ #define CRYPTO_ALG_TYPE_MASK 0x0000000f -#define CRYPTO_ALG_TYPE_CIPHER 0x00000001 -#define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004 -#define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 -#define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 -#define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006 -#define CRYPTO_ALG_TYPE_KPP 0x00000008 -#define CRYPTO_ALG_TYPE_RNG 0x0000000c -#define CRYPTO_ALG_TYPE_AKCIPHER 0x0000000d -#define CRYPTO_ALG_TYPE_DIGEST 0x0000000e -#define CRYPTO_ALG_TYPE_HASH 0x0000000e #define CRYPTO_ALG_TYPE_SHASH 0x0000000e -#define CRYPTO_ALG_TYPE_AHASH 0x0000000f - -#define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e -#define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e #define CRYPTO_ALG_TYPE_BLKCIPHER_MASK 0x0000000c - #define CRYPTO_ALG_ASYNC 0x00000080 -/* - * Set this bit if and only if the algorithm requires another algorithm of - * the same type to handle corner cases. - */ -#define CRYPTO_ALG_NEED_FALLBACK 0x00000100 - -/* - * This bit is set for symmetric key ciphers that have already been wrapped - * with a generic IV generator to prevent them from being wrapped again. - */ -#define CRYPTO_ALG_GENIV 0x00000200 - -/* - * Set if the algorithm is an instance that is build from templates. - */ -#define CRYPTO_ALG_INSTANCE 0x00000800 - -/* Set this bit if the algorithm provided is hardware accelerated but - * not available to userspace via instruction set or so. - */ -#define CRYPTO_ALG_KERN_DRIVER_ONLY 0x00001000 - -/* - * Mark a cipher as a service implementation only usable by another - * cipher and never by a normal user of the kernel crypto API - */ -#define CRYPTO_ALG_INTERNAL 0x00002000 - -/* - * Transform masks and values (for crt_flags). 
- */ -#define CRYPTO_TFM_REQ_MASK 0x000fff00 -#define CRYPTO_TFM_RES_MASK 0xfff00000 - -#define CRYPTO_TFM_REQ_WEAK_KEY 0x00000100 -#define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 -#define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 -#define CRYPTO_TFM_RES_WEAK_KEY 0x00100000 -#define CRYPTO_TFM_RES_BAD_KEY_LEN 0x00200000 -#define CRYPTO_TFM_RES_BAD_KEY_SCHED 0x00400000 -#define CRYPTO_TFM_RES_BAD_BLOCK_LEN 0x00800000 -#define CRYPTO_TFM_RES_BAD_FLAGS 0x01000000 - -/* - * Miscellaneous stuff. - */ #define CRYPTO_MAX_ALG_NAME 64 -/* - * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual - * declaration) is used to ensure that the crypto_tfm context structure is - * aligned correctly for the given architecture so that there are no alignment - * faults for C data types. In particular, this is required on platforms such - * as arm where pointers are 32-bit aligned but there are data types such as - * u64 which require 64-bit alignment. - */ #define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN - #define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN))) struct scatterlist; struct crypto_blkcipher; struct crypto_tfm; struct crypto_type; -struct skcipher_givcrypt_request; struct blkcipher_desc { - struct crypto_blkcipher *tfm; - void *info; - u32 flags; -}; - -struct cipher_desc { - struct crypto_tfm *tfm; - void (*crfn)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - unsigned int (*prfn)(const struct cipher_desc *desc, u8 *dst, - const u8 *src, unsigned int nbytes); - void *info; + struct crypto_blkcipher *tfm; + void *info; + u32 flags; }; struct blkcipher_alg { int (*setkey)(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); + unsigned keylen); int (*encrypt)(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes); + unsigned nbytes); int (*decrypt)(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes); - - const char *geniv; - - unsigned 
int min_keysize; - unsigned int max_keysize; - unsigned int ivsize; -}; - -struct cipher_alg { - unsigned int cia_min_keysize; - unsigned int cia_max_keysize; - int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); - void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -}; - -struct compress_alg { - int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src, - unsigned int slen, u8 *dst, unsigned int *dlen); - int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src, - unsigned int slen, u8 *dst, unsigned int *dlen); + unsigned nbytes); }; - #define cra_blkcipher cra_u.blkcipher -#define cra_cipher cra_u.cipher -#define cra_compress cra_u.compress struct crypto_alg { - struct list_head cra_list; - struct list_head cra_users; - - u32 cra_flags; - unsigned int cra_blocksize; - unsigned int cra_ctxsize; - unsigned int cra_alignmask; - - int cra_priority; - atomic_t cra_refcnt; + struct list_head cra_list; + struct list_head cra_users; - char cra_name[CRYPTO_MAX_ALG_NAME]; - char cra_driver_name[CRYPTO_MAX_ALG_NAME]; + u32 cra_flags; + unsigned cra_ctxsize; + char cra_name[CRYPTO_MAX_ALG_NAME]; const struct crypto_type *cra_type; union { struct blkcipher_alg blkcipher; - struct cipher_alg cipher; - struct compress_alg compress; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); void (*cra_exit)(struct crypto_tfm *tfm); - void (*cra_destroy)(struct crypto_alg *alg); - - struct module *cra_module; } CRYPTO_MINALIGN_ATTR; -/* - * Algorithm registration interface. - */ int crypto_register_alg(struct crypto_alg *alg); -int crypto_unregister_alg(struct crypto_alg *alg); -int crypto_register_algs(struct crypto_alg *algs, int count); -int crypto_unregister_algs(struct crypto_alg *algs, int count); - -/* - * Algorithm query interface. 
- */ -int crypto_has_alg(const char *name, u32 type, u32 mask); - -/* - * Transforms: user-instantiated objects which encapsulate algorithms - * and core processing logic. Managed via crypto_alloc_*() and - * crypto_free_*(), as well as the various helpers below. - */ struct blkcipher_tfm { - void *iv; int (*setkey)(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); + unsigned keylen); int (*encrypt)(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes); + struct scatterlist *src, unsigned nbytes); int (*decrypt)(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes); + struct scatterlist *src, unsigned nbytes); }; -struct cipher_tfm { - int (*cit_setkey)(struct crypto_tfm *tfm, - const u8 *key, unsigned int keylen); - void (*cit_encrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - void (*cit_decrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -}; - -struct compress_tfm { - int (*cot_compress)(struct crypto_tfm *tfm, - const u8 *src, unsigned int slen, - u8 *dst, unsigned int *dlen); - int (*cot_decompress)(struct crypto_tfm *tfm, - const u8 *src, unsigned int slen, - u8 *dst, unsigned int *dlen); -}; - -#define crt_blkcipher crt_u.blkcipher -#define crt_cipher crt_u.cipher -#define crt_compress crt_u.compress - struct crypto_tfm { + u32 crt_flags; - u32 crt_flags; - - union { - struct blkcipher_tfm blkcipher; - struct cipher_tfm cipher; - struct compress_tfm compress; - } crt_u; + struct blkcipher_tfm crt_blkcipher; void (*exit)(struct crypto_tfm *tfm); - struct crypto_alg *__crt_alg; - - void *__crt_ctx[] CRYPTO_MINALIGN_ATTR; -}; - -struct crypto_blkcipher { - struct crypto_tfm base; -}; - -struct crypto_cipher { - struct crypto_tfm base; -}; - -struct crypto_comp { - struct crypto_tfm base; + struct crypto_alg *__crt_alg; + void *__crt_ctx[] CRYPTO_MINALIGN_ATTR; }; -enum { - CRYPTOA_UNSPEC, - CRYPTOA_ALG, - CRYPTOA_TYPE, - 
CRYPTOA_U32, - __CRYPTOA_MAX, -}; - -#define CRYPTOA_MAX (__CRYPTOA_MAX - 1) - -/* Maximum number of (rtattr) parameters for each template. */ -#define CRYPTO_MAX_ATTRS 32 - -struct crypto_attr_alg { - char name[CRYPTO_MAX_ALG_NAME]; -}; - -struct crypto_attr_type { - u32 type; - u32 mask; -}; - -struct crypto_attr_u32 { - u32 num; -}; - -/* - * Transform user interface. - */ - struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask); void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm); @@ -325,110 +107,19 @@ static inline void crypto_free_tfm(struct crypto_tfm *tfm) return crypto_destroy_tfm(tfm, tfm); } -int alg_test(const char *driver, const char *alg, u32 type, u32 mask); - -/* - * Transform helpers which query the underlying algorithm. - */ -static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm) -{ - return tfm->__crt_alg->cra_name; -} - -static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm) -{ - return tfm->__crt_alg->cra_driver_name; -} - -static inline int crypto_tfm_alg_priority(struct crypto_tfm *tfm) -{ - return tfm->__crt_alg->cra_priority; -} - static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK; } -static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm) -{ - return tfm->__crt_alg->cra_blocksize; -} - -static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm) -{ - return tfm->__crt_alg->cra_alignmask; -} - -static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm) -{ - return tfm->crt_flags; -} - -static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags) -{ - tfm->crt_flags |= flags; -} - -static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags) -{ - tfm->crt_flags &= ~flags; -} - static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm) { return tfm->__crt_ctx; } -static inline unsigned int crypto_tfm_ctx_alignment(void) -{ - struct 
crypto_tfm *tfm; - return __alignof__(tfm->__crt_ctx); -} - -static inline u32 crypto_skcipher_type(u32 type) -{ - type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV); - type |= CRYPTO_ALG_TYPE_BLKCIPHER; - return type; -} - -static inline u32 crypto_skcipher_mask(u32 mask) -{ - mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV); - mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK; - return mask; -} - -/** - * DOC: Synchronous Block Cipher API - * - * The synchronous block cipher API is used with the ciphers of type - * CRYPTO_ALG_TYPE_BLKCIPHER (listed as type "blkcipher" in /proc/crypto) - * - * Synchronous calls, have a context in the tfm. But since a single tfm can be - * used in multiple calls and in parallel, this info should not be changeable - * (unless a lock is used). This applies, for example, to the symmetric key. - * However, the IV is changeable, so there is an iv field in blkcipher_tfm - * structure for synchronous blkcipher api. So, its the only state info that can - * be kept for synchronous calls without using a big lock across a tfm. - * - * The block cipher API allows the use of a complete cipher, i.e. a cipher - * consisting of a template (a block chaining mode) and a single block cipher - * primitive (e.g. AES). - * - * The plaintext data buffer and the ciphertext data buffer are pointed to - * by using scatter/gather lists. The cipher operation is performed - * on all segments of the provided scatter/gather lists. - * - * The kernel crypto API supports a cipher operation "in-place" which means that - * the caller may provide the same scatter/gather list for the plaintext and - * cipher text. After the completion of the cipher operation, the plaintext - * data is replaced with the ciphertext data in case of an encryption and vice - * versa for a decryption. The caller must ensure that the scatter/gather lists - * for the output data point to sufficiently large buffers, i.e. multiples of - * the block size of the cipher. 
- */ +struct crypto_blkcipher { + struct crypto_tfm base; +}; static inline struct crypto_blkcipher *__crypto_blkcipher_cast( struct crypto_tfm *tfm) @@ -443,20 +134,6 @@ static inline struct crypto_blkcipher *crypto_blkcipher_cast( return __crypto_blkcipher_cast(tfm); } -/** - * crypto_alloc_blkcipher() - allocate synchronous block cipher handle - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * blkcipher cipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Allocate a cipher handle for a block cipher. The returned struct - * crypto_blkcipher is the cipher handle that is required for any subsequent - * API invocation for that block cipher. - * - * Return: allocated cipher handle in case of success; IS_ERR() is true in case - * of an error, PTR_ERR() returns the error code. - */ static inline struct crypto_blkcipher *crypto_alloc_blkcipher( const char *alg_name, u32 type, u32 mask) { @@ -467,455 +144,30 @@ static inline struct crypto_blkcipher *crypto_alloc_blkcipher( return __crypto_blkcipher_cast(crypto_alloc_base(alg_name, type, mask)); } -static inline struct crypto_tfm *crypto_blkcipher_tfm( - struct crypto_blkcipher *tfm) -{ - return &tfm->base; -} - -/** - * crypto_free_blkcipher() - zeroize and free the block cipher handle - * @tfm: cipher handle to be freed - */ static inline void crypto_free_blkcipher(struct crypto_blkcipher *tfm) { - crypto_free_tfm(crypto_blkcipher_tfm(tfm)); -} - -/** - * crypto_has_blkcipher() - Search for the availability of a block cipher - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * block cipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Return: true when the block cipher is known to the kernel crypto API; false - * otherwise - */ -static inline int crypto_has_blkcipher(const char *alg_name, u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= 
CRYPTO_ALG_TYPE_BLKCIPHER; - mask |= CRYPTO_ALG_TYPE_MASK; - - return crypto_has_alg(alg_name, type, mask); -} - -/** - * crypto_blkcipher_name() - return the name / cra_name from the cipher handle - * @tfm: cipher handle - * - * Return: The character string holding the name of the cipher - */ -static inline const char *crypto_blkcipher_name(struct crypto_blkcipher *tfm) -{ - return crypto_tfm_alg_name(crypto_blkcipher_tfm(tfm)); + crypto_free_tfm(&tfm->base); } static inline struct blkcipher_tfm *crypto_blkcipher_crt( struct crypto_blkcipher *tfm) { - return &crypto_blkcipher_tfm(tfm)->crt_blkcipher; + return &tfm->base.crt_blkcipher; } -static inline struct blkcipher_alg *crypto_blkcipher_alg( - struct crypto_blkcipher *tfm) -{ - return &crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher; -} - -/** - * crypto_blkcipher_ivsize() - obtain IV size - * @tfm: cipher handle - * - * The size of the IV for the block cipher referenced by the cipher handle is - * returned. This IV size may be zero if the cipher does not need an IV. - * - * Return: IV size in bytes - */ -static inline unsigned int crypto_blkcipher_ivsize(struct crypto_blkcipher *tfm) -{ - return crypto_blkcipher_alg(tfm)->ivsize; -} - -/** - * crypto_blkcipher_blocksize() - obtain block size of cipher - * @tfm: cipher handle - * - * The block size for the block cipher referenced with the cipher handle is - * returned. The caller may use that information to allocate appropriate - * memory for the data returned by the encryption or decryption operation. 
- * - * Return: block size of cipher - */ -static inline unsigned int crypto_blkcipher_blocksize( - struct crypto_blkcipher *tfm) -{ - return crypto_tfm_alg_blocksize(crypto_blkcipher_tfm(tfm)); -} - -static inline unsigned int crypto_blkcipher_alignmask( - struct crypto_blkcipher *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_blkcipher_tfm(tfm)); -} - -static inline u32 crypto_blkcipher_get_flags(struct crypto_blkcipher *tfm) -{ - return crypto_tfm_get_flags(crypto_blkcipher_tfm(tfm)); -} - -static inline void crypto_blkcipher_set_flags(struct crypto_blkcipher *tfm, - u32 flags) -{ - crypto_tfm_set_flags(crypto_blkcipher_tfm(tfm), flags); -} - -static inline void crypto_blkcipher_clear_flags(struct crypto_blkcipher *tfm, - u32 flags) -{ - crypto_tfm_clear_flags(crypto_blkcipher_tfm(tfm), flags); -} - -/** - * crypto_blkcipher_setkey() - set key for cipher - * @tfm: cipher handle - * @key: buffer holding the key - * @keylen: length of the key in bytes - * - * The caller provided key is set for the block cipher referenced by the cipher - * handle. - * - * Note, the key length determines the cipher type. Many block ciphers implement - * different cipher modes depending on the key size, such as AES-128 vs AES-192 - * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 - * is performed. 
- * - * Return: 0 if the setting of the key was successful; < 0 if an error occurred - */ static inline int crypto_blkcipher_setkey(struct crypto_blkcipher *tfm, - const u8 *key, unsigned int keylen) + const u8 *key, unsigned keylen) { - return crypto_blkcipher_crt(tfm)->setkey(crypto_blkcipher_tfm(tfm), - key, keylen); + return crypto_blkcipher_crt(tfm)->setkey(&tfm->base, key, keylen); } -/** - * crypto_blkcipher_encrypt() - encrypt plaintext - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * ciphertext - * @src: scatter/gather list that holds the plaintext - * @nbytes: number of bytes of the plaintext to encrypt. - * - * Encrypt plaintext data using the IV set by the caller with a preceding - * call of crypto_blkcipher_set_iv. - * - * The blkcipher_desc data structure must be filled by the caller and can - * reside on the stack. The caller must fill desc as follows: desc.tfm is filled - * with the block cipher handle; desc.flags is filled with either - * CRYPTO_TFM_REQ_MAY_SLEEP or 0. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_blkcipher_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - desc->info = crypto_blkcipher_crt(desc->tfm)->iv; - return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes); -} - -/** - * crypto_blkcipher_encrypt_iv() - encrypt plaintext with dedicated IV - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * ciphertext - * @src: scatter/gather list that holds the plaintext - * @nbytes: number of bytes of the plaintext to encrypt. - * - * Encrypt plaintext data with the use of an IV that is solely used for this - * cipher operation. Any previously set IV is not used. 
- * - * The blkcipher_desc data structure must be filled by the caller and can - * reside on the stack. The caller must fill desc as follows: desc.tfm is filled - * with the block cipher handle; desc.info is filled with the IV to be used for - * the current operation; desc.flags is filled with either - * CRYPTO_TFM_REQ_MAY_SLEEP or 0. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ static inline int crypto_blkcipher_encrypt_iv(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) + unsigned nbytes) { return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes); } -/** - * crypto_blkcipher_decrypt() - decrypt ciphertext - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * plaintext - * @src: scatter/gather list that holds the ciphertext - * @nbytes: number of bytes of the ciphertext to decrypt. - * - * Decrypt ciphertext data using the IV set by the caller with a preceding - * call of crypto_blkcipher_set_iv. - * - * The blkcipher_desc data structure must be filled by the caller as documented - * for the crypto_blkcipher_encrypt call above. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - * - */ -static inline int crypto_blkcipher_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - desc->info = crypto_blkcipher_crt(desc->tfm)->iv; - return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes); -} - -/** - * crypto_blkcipher_decrypt_iv() - decrypt ciphertext with dedicated IV - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * plaintext - * @src: scatter/gather list that holds the ciphertext - * @nbytes: number of bytes of the ciphertext to decrypt. 
- * - * Decrypt ciphertext data with the use of an IV that is solely used for this - * cipher operation. Any previously set IV is not used. - * - * The blkcipher_desc data structure must be filled by the caller as documented - * for the crypto_blkcipher_encrypt_iv call above. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_blkcipher_decrypt_iv(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes); -} - -/** - * crypto_blkcipher_set_iv() - set IV for cipher - * @tfm: cipher handle - * @src: buffer holding the IV - * @len: length of the IV in bytes - * - * The caller provided IV is set for the block cipher referenced by the cipher - * handle. - */ -static inline void crypto_blkcipher_set_iv(struct crypto_blkcipher *tfm, - const u8 *src, unsigned int len) -{ - memcpy(crypto_blkcipher_crt(tfm)->iv, src, len); -} - -/** - * crypto_blkcipher_get_iv() - obtain IV from cipher - * @tfm: cipher handle - * @dst: buffer filled with the IV - * @len: length of the buffer dst - * - * The caller can obtain the IV set for the block cipher referenced by the - * cipher handle and store it into the user-provided buffer. If the buffer - * has an insufficient space, the IV is truncated to fit the buffer. - */ -static inline void crypto_blkcipher_get_iv(struct crypto_blkcipher *tfm, - u8 *dst, unsigned int len) -{ - memcpy(dst, crypto_blkcipher_crt(tfm)->iv, len); -} - -/** - * DOC: Single Block Cipher API - * - * The single block cipher API is used with the ciphers of type - * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto). - * - * Using the single block cipher API calls, operations with the basic cipher - * primitive can be implemented. These cipher primitives exclude any block - * chaining operations including IV handling. 
- * - * The purpose of this single block cipher API is to support the implementation - * of templates or other concepts that only need to perform the cipher operation - * on one block at a time. Templates invoke the underlying cipher primitive - * block-wise and process either the input or the output data of these cipher - * operations. - */ - -static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm) -{ - return (struct crypto_cipher *)tfm; -} - -static inline struct crypto_cipher *crypto_cipher_cast(struct crypto_tfm *tfm) -{ - BUG_ON(crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER); - return __crypto_cipher_cast(tfm); -} - -/** - * crypto_alloc_cipher() - allocate single block cipher handle - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * single block cipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Allocate a cipher handle for a single block cipher. The returned struct - * crypto_cipher is the cipher handle that is required for any subsequent API - * invocation for that single block cipher. - * - * Return: allocated cipher handle in case of success; IS_ERR() is true in case - * of an error, PTR_ERR() returns the error code. 
- */ -static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name, - u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_CIPHER; - mask |= CRYPTO_ALG_TYPE_MASK; - - return __crypto_cipher_cast(crypto_alloc_base(alg_name, type, mask)); -} - -static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm) -{ - return &tfm->base; -} - -/** - * crypto_free_cipher() - zeroize and free the single block cipher handle - * @tfm: cipher handle to be freed - */ -static inline void crypto_free_cipher(struct crypto_cipher *tfm) -{ - crypto_free_tfm(crypto_cipher_tfm(tfm)); -} - -/** - * crypto_has_cipher() - Search for the availability of a single block cipher - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * single block cipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Return: true when the single block cipher is known to the kernel crypto API; - * false otherwise - */ -static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_CIPHER; - mask |= CRYPTO_ALG_TYPE_MASK; - - return crypto_has_alg(alg_name, type, mask); -} - -static inline struct cipher_tfm *crypto_cipher_crt(struct crypto_cipher *tfm) -{ - return &crypto_cipher_tfm(tfm)->crt_cipher; -} - -/** - * crypto_cipher_blocksize() - obtain block size for cipher - * @tfm: cipher handle - * - * The block size for the single block cipher referenced with the cipher handle - * tfm is returned. 
The caller may use that information to allocate appropriate - * memory for the data returned by the encryption or decryption operation - * - * Return: block size of cipher - */ -static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm) -{ - return crypto_tfm_alg_blocksize(crypto_cipher_tfm(tfm)); -} - -static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_cipher_tfm(tfm)); -} - -static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm) -{ - return crypto_tfm_get_flags(crypto_cipher_tfm(tfm)); -} - -static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm, - u32 flags) -{ - crypto_tfm_set_flags(crypto_cipher_tfm(tfm), flags); -} - -static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm, - u32 flags) -{ - crypto_tfm_clear_flags(crypto_cipher_tfm(tfm), flags); -} - -/** - * crypto_cipher_setkey() - set key for cipher - * @tfm: cipher handle - * @key: buffer holding the key - * @keylen: length of the key in bytes - * - * The caller provided key is set for the single block cipher referenced by the - * cipher handle. - * - * Note, the key length determines the cipher type. Many block ciphers implement - * different cipher modes depending on the key size, such as AES-128 vs AES-192 - * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 - * is performed. - * - * Return: 0 if the setting of the key was successful; < 0 if an error occurred - */ -static inline int crypto_cipher_setkey(struct crypto_cipher *tfm, - const u8 *key, unsigned int keylen) -{ - return crypto_cipher_crt(tfm)->cit_setkey(crypto_cipher_tfm(tfm), - key, keylen); -} - -/** - * crypto_cipher_encrypt_one() - encrypt one block of plaintext - * @tfm: cipher handle - * @dst: points to the buffer that will be filled with the ciphertext - * @src: buffer holding the plaintext to be encrypted - * - * Invoke the encryption operation of one block. 
The caller must ensure that - * the plaintext and ciphertext buffers are at least one block in size. - */ -static inline void crypto_cipher_encrypt_one(struct crypto_cipher *tfm, - u8 *dst, const u8 *src) -{ - crypto_cipher_crt(tfm)->cit_encrypt_one(crypto_cipher_tfm(tfm), - dst, src); -} - -/** - * crypto_cipher_decrypt_one() - decrypt one block of ciphertext - * @tfm: cipher handle - * @dst: points to the buffer that will be filled with the plaintext - * @src: buffer holding the ciphertext to be decrypted - * - * Invoke the decryption operation of one block. The caller must ensure that - * the plaintext and ciphertext buffers are at least one block in size. - */ -static inline void crypto_cipher_decrypt_one(struct crypto_cipher *tfm, - u8 *dst, const u8 *src) -{ - crypto_cipher_crt(tfm)->cit_decrypt_one(crypto_cipher_tfm(tfm), - dst, src); -} - #endif /* _LINUX_CRYPTO_H */ diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h deleted file mode 100644 index 8dfcb83b..00000000 --- a/include/linux/cryptohash.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __CRYPTOHASH_H -#define __CRYPTOHASH_H - -#include <linux/types.h> - -#define SHA_DIGEST_WORDS 5 -#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8) -#define SHA_WORKSPACE_WORDS 16 - -void sha_init(__u32 *buf); -void sha_transform(__u32 *digest, const char *data, __u32 *W); - -#define MD5_DIGEST_WORDS 4 -#define MD5_MESSAGE_BYTES 64 - -void md5_transform(__u32 *hash, __u32 const *in); - -__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]); - -#endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2233350b..ac72858b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -207,6 +207,4 @@ int __must_check kstrtoint(const char *s, unsigned int base, int *res); BUILD_BUG_ON_ZERO((perms) & 2) + \ (perms)) -#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) - #endif diff --git a/include/linux/key.h b/include/linux/key.h new file mode 100644 index 00000000..adc12a9e --- 
/dev/null +++ b/include/linux/key.h @@ -0,0 +1,50 @@ +#ifndef _LINUX_KEY_H +#define _LINUX_KEY_H + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/rcupdate.h> +#include <linux/sysctl.h> +#include <linux/rwsem.h> +#include <linux/atomic.h> + +#include <keyutils.h> + +struct key; + +struct user_key_payload { + size_t datalen; /* length of this data */ + char data[0]; /* actual data */ +}; + +struct key { + atomic_t usage; /* number of references */ + key_serial_t serial; /* key serial number */ + struct rw_semaphore sem; /* change vs change sem */ + struct user_key_payload payload; +}; + +static inline const struct user_key_payload *user_key_payload(const struct key *key) +{ + return &key->payload; +} + +static inline void key_put(struct key *key) +{ + if (atomic_dec_and_test(&key->usage)) + free(key); +} + +static inline struct key *__key_get(struct key *key) +{ + atomic_inc(&key->usage); + return key; +} + +static inline struct key *key_get(struct key *key) +{ + return key ? 
__key_get(key) : key; +} + +#endif /* _LINUX_KEY_H */ diff --git a/include/linux/mempool.h b/include/linux/mempool.h index c2789f93..ddf6f941 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -14,6 +14,11 @@ typedef struct mempool_s { size_t elem_size; } mempool_t; +static inline bool mempool_initialized(mempool_t *pool) +{ + return true; +} + extern int mempool_resize(mempool_t *pool, int new_min_nr); static inline void mempool_free(void *element, mempool_t *pool) diff --git a/include/linux/page.h b/include/linux/page.h index c99d9de3..8d6413ce 100644 --- a/include/linux/page.h +++ b/include/linux/page.h @@ -5,8 +5,11 @@ struct page; -#define virt_to_page(kaddr) ((struct page *) (kaddr)) -#define page_address(kaddr) ((void *) (kaddr)) +#define virt_to_page(p) \ + ((struct page *) (((unsigned long) (p)) & PAGE_MASK)) +#define offset_in_page(p) ((unsigned long) (p) & ~PAGE_MASK) + +#define page_address(p) ((void *) (p)) #define kmap_atomic(page) page_address(page) #define kunmap_atomic(addr) do {} while (0) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h new file mode 100644 index 00000000..04bf59df --- /dev/null +++ b/include/linux/scatterlist.h @@ -0,0 +1,111 @@ +#ifndef _LINUX_SCATTERLIST_H +#define _LINUX_SCATTERLIST_H + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/mm.h> + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; +}; + +#define sg_is_chain(sg) ((sg)->page_link & 0x01) +#define sg_is_last(sg) ((sg)->page_link & 0x02) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~0x03)) + +static inline void sg_assign_page(struct scatterlist *sg, struct page *page) +{ + unsigned long page_link = sg->page_link & 0x3; + + /* + * In order for the low bit stealing approach to work, pages + * must be aligned at a 32-bit boundary as a minimum. 
+ */ + BUG_ON((unsigned long) page & 0x03); + sg->page_link = page_link | (unsigned long) page; +} + +static inline void sg_set_page(struct scatterlist *sg, struct page *page, + unsigned int len, unsigned int offset) +{ + sg_assign_page(sg, page); + sg->offset = offset; + sg->length = len; +} + +static inline struct page *sg_page(struct scatterlist *sg) +{ + return (struct page *)((sg)->page_link & ~0x3); +} + +static inline void sg_set_buf(struct scatterlist *sg, const void *buf, + unsigned int buflen) +{ + sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); +} + +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + +#define for_each_sg(sglist, sg, nr, __i) \ + for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) + +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, + struct scatterlist *sgl) +{ + /* + * offset and length are unused for chain entry. Clear them. + */ + prv[prv_nents - 1].offset = 0; + prv[prv_nents - 1].length = 0; + + /* + * Set lowest bit to indicate a link pointer, and make sure to clear + * the termination bit if it happens to be set. 
+ */ + prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02; +} + +static inline void sg_mark_end(struct scatterlist *sg) +{ + sg->page_link |= 0x02; + sg->page_link &= ~0x01; +} + +static inline void sg_unmark_end(struct scatterlist *sg) +{ + sg->page_link &= ~0x02; +} + +static inline void *sg_virt(struct scatterlist *sg) +{ + return page_address(sg_page(sg)) + sg->offset; +} + +static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) +{ + memset(sgl, 0, sizeof(*sgl) * nents); + sg_mark_end(&sgl[nents - 1]); +} + +static inline void sg_init_one(struct scatterlist *sg, const void *buf, + unsigned int buflen) +{ + sg_init_table(sg, 1); + sg_set_buf(sg, buf, buflen); +} + +#endif /* _LINUX_SCATTERLIST_H */ diff --git a/include/linux/time64.h b/include/linux/time64.h index 2e1ad82e..2d9f8291 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -38,6 +38,19 @@ struct itimerspec64 { #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +static inline struct timespec ns_to_timespec(const u64 nsec) +{ + return (struct timespec) { + .tv_sec = nsec / NSEC_PER_SEC, + .tv_nsec = nsec % NSEC_PER_SEC, + }; +} + +static inline s64 timespec_to_ns(const struct timespec *ts) +{ + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; +} + #if __BITS_PER_LONG == 64 static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) @@ -61,11 +74,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) # define ns_to_timespec64 ns_to_timespec # define timespec64_add_ns timespec_add_ns -static inline s64 timespec_to_ns(const struct timespec *ts) -{ - return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; -} - #else static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index d4968c54..01e4b79d 100644 --- a/include/trace/events/bcache.h +++ 
b/include/trace/events/bcache.h @@ -185,7 +185,7 @@ TRACE_EVENT(bcache_write, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; @@ -215,7 +215,7 @@ TRACE_EVENT(bcache_write_throttle, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; @@ -245,7 +245,7 @@ DECLARE_EVENT_CLASS(page_alloc_fail, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->size = size; ), @@ -263,7 +263,7 @@ DECLARE_EVENT_CLASS(cache_set, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ), TP_printk("%pU", __entry->uuid) @@ -285,7 +285,7 @@ TRACE_EVENT(bcache_journal_next_bucket, ), TP_fast_assign( - memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16); + memcpy(__entry->uuid, ca->uuid.b, 16); __entry->cur_idx = cur_idx; __entry->last_idx = last_idx; ), @@ -304,7 +304,7 @@ TRACE_EVENT(bcache_journal_write_oldest, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->seq = seq; ), @@ -322,7 +322,7 @@ TRACE_EVENT(bcache_journal_write_oldest_done, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->seq = seq; __entry->written = written; ), @@ -368,7 +368,7 @@ DECLARE_EVENT_CLASS(cache, ), TP_fast_assign( - memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16); + memcpy(__entry->uuid, ca->uuid.b, 16); __entry->tier = ca->mi.tier; ), @@ -418,7 +418,7 @@ DECLARE_EVENT_CLASS(btree_node, ), TP_fast_assign( - memcpy(__entry->uuid, 
c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0); __entry->level = b->level; __entry->id = b->btree_id; @@ -471,7 +471,7 @@ TRACE_EVENT(bcache_btree_node_alloc_fail, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->id = id; ), @@ -514,7 +514,7 @@ TRACE_EVENT(bcache_mca_scan, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->touched = touched; __entry->freed = freed; __entry->can_free = can_free; @@ -535,7 +535,7 @@ DECLARE_EVENT_CLASS(mca_cannibalize_lock, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ), TP_printk("%pU", __entry->uuid) @@ -675,7 +675,7 @@ TRACE_EVENT(bcache_btree_gc_coalesce_fail, TP_fast_assign( __entry->reason = reason; - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->disk_sb->user_uuid.b, 16); ), TP_printk("%pU: %u", __entry->uuid, __entry->reason) @@ -696,7 +696,7 @@ TRACE_EVENT(bcache_btree_node_alloc_replacement, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->old_bucket = PTR_BUCKET_NR_TRACE(c, &old->key, 0); __entry->bucket = PTR_BUCKET_NR_TRACE(c, &b->key, 0); @@ -778,7 +778,7 @@ TRACE_EVENT(bcache_mark_bucket, ), TP_fast_assign( - memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16); + memcpy(__entry->uuid, ca->uuid.b, 16); __entry->inode = k->p.inode; __entry->offset = k->p.offset; __entry->sectors = sectors; @@ -804,7 +804,7 @@ TRACE_EVENT(bcache_alloc_batch, ), TP_fast_assign( - memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16); + memcpy(__entry->uuid, ca->uuid.b, 16); __entry->free = free; __entry->total = total; ), @@ -824,7 +824,7 @@ TRACE_EVENT(bcache_btree_reserve_get_fail, ), TP_fast_assign( - 
memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->required = required; __entry->cl = cl; ), @@ -879,7 +879,7 @@ DECLARE_EVENT_CLASS(cache_bucket_alloc, ), TP_fast_assign( - memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16); + memcpy(__entry->uuid, ca->uuid.b, 16); __entry->reserve = reserve; ), @@ -908,7 +908,7 @@ DECLARE_EVENT_CLASS(cache_set_bucket_alloc, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->reserve = reserve; __entry->cl = cl; ), @@ -933,7 +933,7 @@ DECLARE_EVENT_CLASS(open_bucket_alloc, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->cl = cl; ), @@ -1054,7 +1054,7 @@ TRACE_EVENT(bcache_moving_gc_end, ), TP_fast_assign( - memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16); + memcpy(__entry->uuid, ca->uuid.b, 16); __entry->sectors_moved = sectors_moved; __entry->keys_moved = keys_moved; __entry->buckets_moved = buckets_moved; @@ -1114,7 +1114,7 @@ TRACE_EVENT(bcache_tiering_end, ), TP_fast_assign( - memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->sectors_moved = sectors_moved; __entry->keys_moved = keys_moved; ), diff --git a/libbcache.c b/libbcache.c index 802d3b4c..cc294bd4 100644 --- a/libbcache.c +++ b/libbcache.c @@ -7,6 +7,7 @@ #include <string.h> #include <sys/stat.h> #include <sys/types.h> +#include <time.h> #include <unistd.h> #include <uuid/uuid.h> @@ -14,30 +15,17 @@ #include "linux/bcache.h" #include "libbcache.h" #include "checksum.h" +#include "crypto.h" #include "opts.h" +#include "super-io.h" + +#define NSEC_PER_SEC 1000000000L #define BCH_MIN_NR_NBUCKETS (1 << 10) /* first bucket should start 1 mb in, in sectors: */ #define FIRST_BUCKET_OFFSET (1 << 11) -void __do_write_sb(int fd, void *sb, size_t bytes) -{ - char zeroes[SB_SECTOR << 9] = {0}; - - /* 
Zero start of disk */ - xpwrite(fd, zeroes, SB_SECTOR << 9, 0); - - /* Write superblock */ - xpwrite(fd, sb, bytes, SB_SECTOR << 9); - - fsync(fd); - close(fd); -} - -#define do_write_sb(_fd, _sb) \ - __do_write_sb(_fd, _sb, ((void *) __bset_bkey_last(_sb)) - (void *) _sb); - /* minimum size filesystem we can create, given a bucket size: */ static u64 min_size(unsigned bucket_size) { @@ -45,12 +33,26 @@ static u64 min_size(unsigned bucket_size) BCH_MIN_NR_NBUCKETS) * bucket_size; } +static void init_layout(struct bch_sb_layout *l) +{ + memset(l, 0, sizeof(*l)); + + l->magic = BCACHE_MAGIC; + l->layout_type = 0; + l->nr_superblocks = 2; + l->sb_max_size_bits = 7; + l->sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR); + l->sb_offset[1] = cpu_to_le64(BCH_SB_SECTOR + + (1 << l->sb_max_size_bits)); +} + void bcache_format(struct dev_opts *devs, size_t nr_devs, unsigned block_size, unsigned btree_node_size, unsigned meta_csum_type, unsigned data_csum_type, unsigned compression_type, + const char *passphrase, unsigned meta_replicas, unsigned data_replicas, unsigned on_error_action, @@ -58,8 +60,10 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, char *label, uuid_le uuid) { - struct cache_sb *sb; + struct bch_sb *sb; struct dev_opts *i; + struct bch_sb_field_members *mi; + unsigned u64s, j; /* calculate block size: */ if (!block_size) @@ -124,16 +128,20 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, max_journal_entry_size = roundup_pow_of_two(max_journal_entry_size); - sb = calloc(1, sizeof(*sb) + sizeof(struct cache_member) * nr_devs); + sb = calloc(1, sizeof(*sb) + + sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devs + + sizeof(struct bch_sb_field_crypt)); - sb->offset = __cpu_to_le64(SB_SECTOR); - sb->version = __cpu_to_le64(BCACHE_SB_VERSION_CDEV_V3); + sb->version = cpu_to_le64(BCACHE_SB_VERSION_CDEV_V4); sb->magic = BCACHE_MAGIC; - sb->block_size = __cpu_to_le16(block_size); + sb->block_size = cpu_to_le16(block_size); 
sb->user_uuid = uuid; - sb->nr_in_set = nr_devs; + sb->nr_devices = nr_devs; + + init_layout(&sb->layout); - uuid_generate(sb->set_uuid.b); + uuid_generate(sb->uuid.b); if (label) strncpy((char *) sb->label, label, sizeof(sb->label)); @@ -142,44 +150,85 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, * don't have a userspace crc32c implementation handy, just always use * crc64 */ - SET_CACHE_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64); - SET_CACHE_SET_META_PREFERRED_CSUM_TYPE(sb, meta_csum_type); - SET_CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb, data_csum_type); - SET_CACHE_SET_COMPRESSION_TYPE(sb, compression_type); - - SET_CACHE_SET_BTREE_NODE_SIZE(sb, btree_node_size); - SET_CACHE_SET_META_REPLICAS_WANT(sb, meta_replicas); - SET_CACHE_SET_META_REPLICAS_HAVE(sb, meta_replicas); - SET_CACHE_SET_DATA_REPLICAS_WANT(sb, data_replicas); - SET_CACHE_SET_DATA_REPLICAS_HAVE(sb, data_replicas); - SET_CACHE_SET_ERROR_ACTION(sb, on_error_action); - SET_CACHE_SET_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH); - SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, ilog2(max_journal_entry_size)); + SET_BCH_SB_CSUM_TYPE(sb, BCH_CSUM_CRC64); + SET_BCH_SB_META_CSUM_TYPE(sb, meta_csum_type); + SET_BCH_SB_DATA_CSUM_TYPE(sb, data_csum_type); + SET_BCH_SB_COMPRESSION_TYPE(sb, compression_type); + + SET_BCH_SB_BTREE_NODE_SIZE(sb, btree_node_size); + SET_BCH_SB_GC_RESERVE(sb, 8); + SET_BCH_SB_META_REPLICAS_WANT(sb, meta_replicas); + SET_BCH_SB_META_REPLICAS_HAVE(sb, meta_replicas); + SET_BCH_SB_DATA_REPLICAS_WANT(sb, data_replicas); + SET_BCH_SB_DATA_REPLICAS_HAVE(sb, data_replicas); + SET_BCH_SB_ERROR_ACTION(sb, on_error_action); + SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH); + SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb, ilog2(max_journal_entry_size)); + + struct timespec now; + if (clock_gettime(CLOCK_REALTIME, &now)) + die("error getting current time: %s", strerror(errno)); + + sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec); + sb->time_precision = cpu_to_le32(1); + + if (passphrase) { 
+ struct bch_sb_field_crypt *crypt = vstruct_end(sb); + + u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64); + + le32_add_cpu(&sb->u64s, u64s); + crypt->field.u64s = cpu_to_le32(u64s); + crypt->field.type = BCH_SB_FIELD_crypt; + + bch_sb_crypt_init(sb, crypt, passphrase); + SET_BCH_SB_ENCRYPTION_TYPE(sb, 1); + } + + mi = vstruct_end(sb); + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devs) / sizeof(u64); + + le32_add_cpu(&sb->u64s, u64s); + mi->field.u64s = cpu_to_le32(u64s); + mi->field.type = BCH_SB_FIELD_members; for (i = devs; i < devs + nr_devs; i++) { - struct cache_member *m = sb->members + (i - devs); + struct bch_member *m = mi->members + (i - devs); uuid_generate(m->uuid.b); - m->nbuckets = __cpu_to_le64(i->nbuckets); - m->first_bucket = __cpu_to_le16(i->first_bucket); - m->bucket_size = __cpu_to_le16(i->bucket_size); + m->nbuckets = cpu_to_le64(i->nbuckets); + m->first_bucket = cpu_to_le16(i->first_bucket); + m->bucket_size = cpu_to_le16(i->bucket_size); - SET_CACHE_TIER(m, i->tier); - SET_CACHE_REPLACEMENT(m, CACHE_REPLACEMENT_LRU); - SET_CACHE_DISCARD(m, i->discard); + SET_BCH_MEMBER_TIER(m, i->tier); + SET_BCH_MEMBER_REPLACEMENT(m, CACHE_REPLACEMENT_LRU); + SET_BCH_MEMBER_DISCARD(m, i->discard); } - sb->u64s = __cpu_to_le16(bch_journal_buckets_offset(sb)); - for (i = devs; i < devs + nr_devs; i++) { - struct cache_member *m = sb->members + (i - devs); + sb->dev_idx = i - devs; + + static const char zeroes[BCH_SB_SECTOR << 9]; + struct nonce nonce = { 0 }; + + /* Zero start of disk */ + xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0); + + xpwrite(i->fd, &sb->layout, sizeof(sb->layout), + BCH_SB_LAYOUT_SECTOR << 9); - sb->disk_uuid = m->uuid; - sb->nr_this_dev = i - devs; - sb->csum = __cpu_to_le64(__csum_set(sb, __le16_to_cpu(sb->u64s), - CACHE_SB_CSUM_TYPE(sb))); + for (j = 0; j < sb->layout.nr_superblocks; j++) { + sb->offset = sb->layout.sb_offset[j]; - do_write_sb(i->fd, sb); + sb->csum = csum_vstruct(NULL, 
BCH_SB_CSUM_TYPE(sb), + nonce, sb); + xpwrite(i->fd, sb, vstruct_bytes(sb), + le64_to_cpu(sb->offset) << 9); + } + + fsync(i->fd); + close(i->fd); } bcache_super_print(sb, HUMAN_READABLE); @@ -187,16 +236,39 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, free(sb); } -void bcache_super_print(struct cache_sb *sb, int units) +struct bch_sb *bcache_super_read(const char *path) { - unsigned i; + struct bch_sb sb, *ret; + + int fd = open(path, O_RDONLY); + if (fd < 0) + die("couldn't open %s", path); + + xpread(fd, &sb, sizeof(sb), BCH_SB_SECTOR << 9); + + if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic))) + die("not a bcache superblock"); + + size_t bytes = vstruct_bytes(&sb); + + ret = malloc(bytes); + + xpread(fd, ret, bytes, BCH_SB_SECTOR << 9); + + return ret; +} + +void bcache_super_print(struct bch_sb *sb, int units) +{ + struct bch_sb_field_members *mi; char user_uuid_str[40], internal_uuid_str[40], member_uuid_str[40]; - char label[SB_LABEL_SIZE + 1]; + char label[BCH_SB_LABEL_SIZE + 1]; + unsigned i; memset(label, 0, sizeof(label)); memcpy(label, sb->label, sizeof(sb->label)); uuid_unparse(sb->user_uuid.b, user_uuid_str); - uuid_unparse(sb->set_uuid.b, internal_uuid_str); + uuid_unparse(sb->uuid.b, internal_uuid_str); printf("External UUID: %s\n" "Internal UUID: %s\n" @@ -226,44 +298,50 @@ void bcache_super_print(struct cache_sb *sb, int units) label, le64_to_cpu(sb->version), pr_units(le16_to_cpu(sb->block_size), units), - pr_units(CACHE_SET_BTREE_NODE_SIZE(sb), units), - pr_units(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb), units), + pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units), + pr_units(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), units), - CACHE_SET_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS - ? bch_error_actions[CACHE_SET_ERROR_ACTION(sb)] + BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS + ? 
bch_error_actions[BCH_SB_ERROR_ACTION(sb)] : "unknown", - CACHE_SET_CLEAN(sb), + BCH_SB_CLEAN(sb), - CACHE_SET_META_REPLICAS_HAVE(sb), - CACHE_SET_META_REPLICAS_WANT(sb), - CACHE_SET_DATA_REPLICAS_HAVE(sb), - CACHE_SET_DATA_REPLICAS_WANT(sb), + BCH_SB_META_REPLICAS_HAVE(sb), + BCH_SB_META_REPLICAS_WANT(sb), + BCH_SB_DATA_REPLICAS_HAVE(sb), + BCH_SB_DATA_REPLICAS_WANT(sb), - CACHE_SET_META_PREFERRED_CSUM_TYPE(sb) < BCH_CSUM_NR - ? bch_csum_types[CACHE_SET_META_PREFERRED_CSUM_TYPE(sb)] + BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_NR + ? bch_csum_types[BCH_SB_META_CSUM_TYPE(sb)] : "unknown", - CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb) < BCH_CSUM_NR - ? bch_csum_types[CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb)] + BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_NR + ? bch_csum_types[BCH_SB_DATA_CSUM_TYPE(sb)] : "unknown", - CACHE_SET_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_NR - ? bch_compression_types[CACHE_SET_COMPRESSION_TYPE(sb)] + BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_NR + ? bch_compression_types[BCH_SB_COMPRESSION_TYPE(sb)] : "unknown", - CACHE_SET_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR - ? bch_str_hash_types[CACHE_SET_STR_HASH_TYPE(sb)] + BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR + ? bch_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)] : "unknown", - CACHE_INODE_32BIT(sb), - CACHE_SET_GC_RESERVE(sb), - CACHE_SET_ROOT_RESERVE(sb), + BCH_SB_INODE_32BIT(sb), + BCH_SB_GC_RESERVE(sb), + BCH_SB_ROOT_RESERVE(sb), - sb->nr_in_set); + sb->nr_devices); - for (i = 0; i < sb->nr_in_set; i++) { - struct cache_member *m = sb->members + i; + mi = bch_sb_get_members(sb); + if (!mi) { + printf("Member info section missing\n"); + return; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; time_t last_mount = le64_to_cpu(m->last_mount); uuid_unparse(m->uuid.b, member_uuid_str); @@ -290,41 +368,18 @@ void bcache_super_print(struct cache_sb *sb, int units) le64_to_cpu(m->nbuckets), last_mount ? ctime(&last_mount) : "(never)", - CACHE_STATE(m) < CACHE_STATE_NR - ? 
bch_cache_state[CACHE_STATE(m)] + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? bch_cache_state[BCH_MEMBER_STATE(m)] : "unknown", - CACHE_TIER(m), - CACHE_HAS_METADATA(m), - CACHE_HAS_DATA(m), + BCH_MEMBER_TIER(m), + BCH_MEMBER_HAS_METADATA(m), + BCH_MEMBER_HAS_DATA(m), - CACHE_REPLACEMENT(m) < CACHE_REPLACEMENT_NR - ? bch_cache_replacement_policies[CACHE_REPLACEMENT(m)] + BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR + ? bch_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)] : "unknown", - CACHE_DISCARD(m)); + BCH_MEMBER_DISCARD(m)); } } - -struct cache_sb *bcache_super_read(const char *path) -{ - struct cache_sb sb, *ret; - size_t bytes; - - int fd = open(path, O_RDONLY); - if (fd < 0) - die("couldn't open %s", path); - - xpread(fd, &sb, sizeof(sb), SB_SECTOR << 9); - - if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic))) - die("not a bcache superblock"); - - bytes = sizeof(sb) + le16_to_cpu(sb.u64s) * sizeof(u64); - - ret = calloc(1, bytes); - - xpread(fd, ret, bytes, SB_SECTOR << 9); - - return ret; -} diff --git a/libbcache.h b/libbcache.h index 07329cd1..6ec3f42d 100644 --- a/libbcache.h +++ b/libbcache.h @@ -2,6 +2,8 @@ #define _LIBBCACHE_H #include <linux/uuid.h> +#include "tools-util.h" +#include "vstructs.h" #include "stdbool.h" #include "tools-util.h" @@ -34,6 +36,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, unsigned meta_csum_type, unsigned data_csum_type, unsigned compression_type, + const char *passphrase, unsigned meta_replicas, unsigned data_replicas, unsigned on_error_action, @@ -41,8 +44,8 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs, char *label, uuid_le uuid); -void bcache_super_print(struct cache_sb *, int); +struct bch_sb *bcache_super_read(const char *); -struct cache_sb *bcache_super_read(const char *); +void bcache_super_print(struct bch_sb *, int); #endif /* _LIBBCACHE_H */ diff --git a/libbcache/acl.c b/libbcache/acl.c index 64d56165..468d98da 100644 --- a/libbcache/acl.c +++ b/libbcache/acl.c @@ 
-187,7 +187,7 @@ int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (ret < 0) return ret; else { - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_fs_time(inode->i_sb); mark_inode_dirty(inode); if (ret == 0) acl = NULL; diff --git a/libbcache/alloc.c b/libbcache/alloc.c index 4fe08b57..cd22c381 100644 --- a/libbcache/alloc.c +++ b/libbcache/alloc.c @@ -64,7 +64,7 @@ #include "extents.h" #include "io.h" #include "journal.h" -#include "super.h" +#include "super-io.h" #include <linux/blkdev.h> #include <linux/kthread.h> @@ -105,7 +105,7 @@ void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca) if (rcu_access_pointer(grp->d[i].dev) == ca) goto out; - BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET); + BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX); rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca); out: @@ -124,9 +124,9 @@ static void pd_controllers_update(struct work_struct *work) int i; /* All units are in bytes */ - u64 tier_size[CACHE_TIERS]; - u64 tier_free[CACHE_TIERS]; - u64 tier_dirty[CACHE_TIERS]; + u64 tier_size[BCH_TIER_MAX]; + u64 tier_free[BCH_TIER_MAX]; + u64 tier_dirty[BCH_TIER_MAX]; u64 tier0_can_free = 0; memset(tier_size, 0, sizeof(tier_size)); @@ -134,7 +134,7 @@ static void pd_controllers_update(struct work_struct *work) memset(tier_dirty, 0, sizeof(tier_dirty)); rcu_read_lock(); - for (i = CACHE_TIERS - 1; i >= 0; --i) + for (i = BCH_TIER_MAX - 1; i >= 0; --i) group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) { struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); unsigned bucket_bits = ca->bucket_bits + 9; @@ -246,6 +246,16 @@ static int prio_io(struct cache *ca, uint64_t bucket, int op) return submit_bio_wait(ca->bio_prio); } +static struct nonce prio_nonce(struct prio_set *p) +{ + return (struct nonce) {{ + [0] = 0, + [1] = p->nonce[0], + [2] = p->nonce[1], + [3] = p->nonce[2]^BCH_NONCE_PRIO, + }}; +} + static int bch_prio_write(struct cache *ca) { struct cache_set *c = 
ca->set; @@ -279,12 +289,8 @@ static int bch_prio_write(struct cache *ca) } p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]); - p->magic = cpu_to_le64(pset_magic(&c->disk_sb)); - - SET_PSET_CSUM_TYPE(p, c->opts.metadata_checksum); - p->csum = cpu_to_le64(bch_checksum(PSET_CSUM_TYPE(p), - &p->magic, - bucket_bytes(ca) - 8)); + p->magic = cpu_to_le64(pset_magic(c)); + get_random_bytes(&p->nonce, sizeof(p->nonce)); spin_lock(&ca->prio_buckets_lock); r = bch_bucket_alloc(ca, RESERVE_PRIO); @@ -298,6 +304,19 @@ static int bch_prio_write(struct cache *ca) bch_mark_metadata_bucket(ca, ca->buckets + r, false); spin_unlock(&ca->prio_buckets_lock); + SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c)); + + bch_encrypt(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + p->encrypted_start, + bucket_bytes(ca) - + offsetof(struct prio_set, encrypted_start)); + + p->csum = bch_checksum(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + (void *) p + sizeof(p->csum), + bucket_bytes(ca) - sizeof(p->csum)); + ret = prio_io(ca, r, REQ_OP_WRITE); if (cache_fatal_io_err_on(ret, ca, "prio write to bucket %zu", r) || @@ -306,9 +325,9 @@ static int bch_prio_write(struct cache *ca) } spin_lock(&j->lock); - j->prio_buckets[ca->sb.nr_this_dev] = cpu_to_le64(ca->prio_buckets[0]); + j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]); j->nr_prio_buckets = max_t(unsigned, - ca->sb.nr_this_dev + 1, + ca->dev_idx + 1, j->nr_prio_buckets); spin_unlock(&j->lock); @@ -320,7 +339,7 @@ static int bch_prio_write(struct cache *ca) return ret; need_new_journal_entry = j->buf[res.idx].nr_prio_buckets < - ca->sb.nr_this_dev + 1; + ca->dev_idx + 1; bch_journal_res_put(j, &res); ret = bch_journal_flush_seq(j, res.seq); @@ -355,13 +374,14 @@ int bch_prio_read(struct cache *ca) struct prio_set *p = ca->disk_buckets; struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; struct bucket_mark new; + struct bch_csum csum; unsigned bucket_nr = 0; u64 bucket, expect, got; size_t b; int ret = 0; 
spin_lock(&c->journal.lock); - bucket = le64_to_cpu(c->journal.prio_buckets[ca->sb.nr_this_dev]); + bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]); spin_unlock(&c->journal.lock); /* @@ -387,18 +407,28 @@ int bch_prio_read(struct cache *ca) return -EIO; got = le64_to_cpu(p->magic); - expect = pset_magic(&c->disk_sb); + expect = pset_magic(c); unfixable_fsck_err_on(got != expect, c, "bad magic (got %llu expect %llu) while reading prios from bucket %llu", got, expect, bucket); - got = le64_to_cpu(p->csum); - expect = bch_checksum(PSET_CSUM_TYPE(p), - &p->magic, - bucket_bytes(ca) - 8); - unfixable_fsck_err_on(got != expect, c, - "bad checksum (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket); + unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c, + "prio bucket with unknown csum type %llu bucket %lluu", + PSET_CSUM_TYPE(p), bucket); + + csum = bch_checksum(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + (void *) p + sizeof(p->csum), + bucket_bytes(ca) - sizeof(p->csum)); + unfixable_fsck_err_on(bch_crc_cmp(csum, p->csum), c, + "bad checksum reading prios from bucket %llu", + bucket); + + bch_encrypt(c, PSET_CSUM_TYPE(p), + prio_nonce(p), + p->encrypted_start, + bucket_bytes(ca) - + offsetof(struct prio_set, encrypted_start)); bucket = le64_to_cpu(p->next_bucket); d = p->data; @@ -1029,7 +1059,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, spin_lock(&devs->lock); for (i = 0; i < devs->nr_devices; i++) - available += !test_bit(devs->d[i].dev->sb.nr_this_dev, + available += !test_bit(devs->d[i].dev->dev_idx, caches_used); recalc_alloc_group_weights(c, devs); @@ -1054,7 +1084,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, ca = devs->d[i].dev; - if (test_bit(ca->sb.nr_this_dev, caches_used)) + if (test_bit(ca->dev_idx, caches_used)) continue; if (fail_idx == -1 && @@ -1082,11 +1112,11 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c, 
ob->ptrs[0] = (struct bch_extent_ptr) { .gen = ca->buckets[bucket].mark.gen, .offset = bucket_to_sector(ca, bucket), - .dev = ca->sb.nr_this_dev, + .dev = ca->dev_idx, }; ob->ptr_offset[0] = 0; - __set_bit(ca->sb.nr_this_dev, caches_used); + __set_bit(ca->dev_idx, caches_used); available--; devs->cur_device = i; } @@ -1334,7 +1364,7 @@ static int open_bucket_add_buckets(struct cache_set *c, enum alloc_reserve reserve, struct closure *cl) { - long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)]; + long caches_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; int i, dst; /* @@ -1475,6 +1505,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e, EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev)); tmp = ob->ptrs[i]; + tmp.cached = bkey_extent_is_cached(&e->k); tmp.offset += ob->ptr_offset[i]; extent_ptr_append(e, tmp); @@ -1657,7 +1688,7 @@ static void bch_stop_write_point(struct cache *ca, return; for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->sb.nr_this_dev) + if (ptr->dev == ca->dev_idx) goto found; mutex_unlock(&ob->lock); @@ -1682,7 +1713,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca) if (atomic_read(&ob->pin)) { mutex_lock(&ob->lock); for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++) - if (ptr->dev == ca->sb.nr_this_dev) { + if (ptr->dev == ca->dev_idx) { mutex_unlock(&ob->lock); return true; } diff --git a/libbcache/alloc_types.h b/libbcache/alloc_types.h index 337b6e46..fbe8b75c 100644 --- a/libbcache/alloc_types.h +++ b/libbcache/alloc_types.h @@ -56,7 +56,7 @@ struct cache_group { struct { u64 weight; struct cache *dev; - } d[MAX_CACHES_PER_SET]; + } d[BCH_SB_MEMBERS_MAX]; }; /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */ diff --git a/libbcache/bcache.h b/libbcache/bcache.h index 309d3728..8a0262fb 100644 --- a/libbcache/bcache.h +++ b/libbcache/bcache.h @@ -314,6 +314,8 @@ do { \ struct btree; struct cache; +struct crypto_blkcipher; 
+struct crypto_ahash; enum gc_phase { GC_PHASE_PENDING_DELETE = BTREE_ID_NR + 1, @@ -332,7 +334,6 @@ struct cache_member_cpu { u16 bucket_size; /* sectors */ u8 state; u8 tier; - u8 replication_set; u8 has_metadata; u8 has_data; u8 replacement; @@ -342,7 +343,7 @@ struct cache_member_cpu { struct cache_member_rcu { struct rcu_head rcu; - unsigned nr_in_set; + unsigned nr_devices; struct cache_member_cpu m[]; }; @@ -363,14 +364,13 @@ struct cache { struct cache_group self; + u8 dev_idx; /* * Cached version of this device's member info from superblock - * Committed by write_super() + * Committed by bch_write_super() -> bch_cache_set_mi_update() */ - struct { - u8 nr_this_dev; - } sb; struct cache_member_cpu mi; + uuid_le uuid; struct bcache_superblock disk_sb; @@ -518,36 +518,45 @@ struct cache_set { struct percpu_ref writes; struct work_struct read_only_work; - struct cache __rcu *cache[MAX_CACHES_PER_SET]; - - struct mutex mi_lock; - struct cache_member_rcu __rcu *members; - struct cache_member *disk_mi; /* protected by register_lock */ + struct cache __rcu *cache[BCH_SB_MEMBERS_MAX]; struct cache_set_opts opts; /* * Cached copy in native endianness: - * Set by cache_sb_to_cache_set: + * Set by bch_cache_set_mi_update(): */ + struct cache_member_rcu __rcu *members; + + /* Updated by bch_sb_update():*/ struct { + uuid_le uuid; + uuid_le user_uuid; + u16 block_size; u16 btree_node_size; - u8 nr_in_set; + u8 nr_devices; u8 clean; u8 meta_replicas_have; u8 data_replicas_have; u8 str_hash_type; + u8 encryption_type; + + u64 time_base_lo; + u32 time_base_hi; + u32 time_precision; } sb; - struct cache_sb disk_sb; + struct bch_sb *disk_sb; + unsigned disk_sb_order; + unsigned short block_bits; /* ilog2(block_size) */ struct closure sb_write; - struct semaphore sb_write_mutex; + struct mutex sb_lock; struct backing_dev_info bdi; @@ -631,7 +640,7 @@ struct cache_set { * allocate from: */ struct cache_group cache_all; - struct cache_group cache_tiers[CACHE_TIERS]; + struct 
cache_group cache_tiers[BCH_TIER_MAX]; u64 capacity; /* sectors */ @@ -724,6 +733,11 @@ struct cache_set { struct bio_decompress_worker __percpu *bio_decompress_worker; + struct crypto_blkcipher *chacha20; + struct crypto_shash *poly1305; + + atomic64_t key_version; + /* For punting bio submissions to workqueue, io.c */ struct bio_list bio_submit_list; struct work_struct bio_submit_work; diff --git a/libbcache/bkey.c b/libbcache/bkey.c index 64d2c845..374237e2 100644 --- a/libbcache/bkey.c +++ b/libbcache/bkey.c @@ -81,9 +81,9 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - p("u64s %u type %u %llu:%llu snap %u len %u ver %u", + p("u64s %u type %u %llu:%llu snap %u len %u ver %llu", k->u64s, k->type, k->p.inode, k->p.offset, - k->p.snapshot, k->size, k->version); + k->p.snapshot, k->size, k->version.lo); BUG_ON(bkey_packed(k)); @@ -258,13 +258,21 @@ bool bch_bkey_transform(const struct bkey_format *out_f, return true; } +#define bkey_fields() \ + x(BKEY_FIELD_INODE, p.inode) \ + x(BKEY_FIELD_OFFSET, p.offset) \ + x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ + x(BKEY_FIELD_SIZE, size) \ + x(BKEY_FIELD_VERSION_HI, version.hi) \ + x(BKEY_FIELD_VERSION_LO, version.lo) + struct bkey __bkey_unpack_key(const struct bkey_format *format, const struct bkey_packed *in) { struct unpack_state state = unpack_state_init(format, in); struct bkey out; - EBUG_ON(format->nr_fields != 5); + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); EBUG_ON(in->u64s < format->key_u64s); EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); @@ -274,11 +282,10 @@ struct bkey __bkey_unpack_key(const struct bkey_format *format, out.needs_whiteout = in->needs_whiteout; out.type = in->type; out.pad[0] = 0; - out.p.inode = get_inc_field(&state, BKEY_FIELD_INODE); - out.p.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); - out.p.snapshot = get_inc_field(&state, 
BKEY_FIELD_SNAPSHOT); - out.size = get_inc_field(&state, BKEY_FIELD_SIZE); - out.version = get_inc_field(&state, BKEY_FIELD_VERSION); + +#define x(id, field) out.field = get_inc_field(&state, id); + bkey_fields() +#undef x return out; } @@ -290,7 +297,7 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format, struct unpack_state state = unpack_state_init(format, in); struct bpos out; - EBUG_ON(format->nr_fields != 5); + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); EBUG_ON(in->u64s < format->key_u64s); EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); @@ -311,17 +318,14 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in, struct pack_state state = pack_state_init(format, out); EBUG_ON((void *) in == (void *) out); - EBUG_ON(format->nr_fields != 5); + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); EBUG_ON(in->format != KEY_FORMAT_CURRENT); out->_data[0] = 0; - if (!set_inc_field(&state, BKEY_FIELD_INODE, in->p.inode) || - !set_inc_field(&state, BKEY_FIELD_OFFSET, in->p.offset) || - !set_inc_field(&state, BKEY_FIELD_SNAPSHOT, in->p.snapshot) || - !set_inc_field(&state, BKEY_FIELD_SIZE, in->size) || - !set_inc_field(&state, BKEY_FIELD_VERSION, in->version)) - return false; +#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; + bkey_fields() +#undef x /* * Extents - we have to guarantee that if an extent is packed, a trimmed @@ -340,47 +344,6 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in, return true; } -/* - * Alternate implementations using bch_bkey_transform_key() - unfortunately, too - * slow - */ -#if 0 -struct bkey __bkey_unpack_key(const struct bkey_format *format, - const struct bkey_packed *in) -{ - struct bkey out; - bool s; - - EBUG_ON(format->nr_fields != 5); - EBUG_ON(in->u64s < format->key_u64s); - EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); - - s = bch_bkey_transform_key(&bch_bkey_format_current, (void *) &out, - format, in); - EBUG_ON(!s); - - out.format = KEY_FORMAT_CURRENT; - - 
return out; -} - -bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in, - const struct bkey_format *format) -{ - EBUG_ON(format->nr_fields != 5); - EBUG_ON(in->format != KEY_FORMAT_CURRENT); - - if (!bch_bkey_transform_key(format, out, - &bch_bkey_format_current, (void *) in)) - return false; - - out->format = KEY_FORMAT_LOCAL_BTREE; - - bch_bkey_pack_verify(out, in, format); - return true; -} -#endif - /** * bkey_unpack -- unpack the key and the value */ @@ -588,12 +551,10 @@ static void __bkey_format_add(struct bkey_format_state *s, */ void bch_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) { - __bkey_format_add(s, BKEY_FIELD_INODE, k->p.inode); - __bkey_format_add(s, BKEY_FIELD_OFFSET, k->p.offset); +#define x(id, field) __bkey_format_add(s, id, k->field); + bkey_fields() +#undef x __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); - __bkey_format_add(s, BKEY_FIELD_SNAPSHOT, k->p.snapshot); - __bkey_format_add(s, BKEY_FIELD_SIZE, k->size); - __bkey_format_add(s, BKEY_FIELD_VERSION, k->version); } void bch_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) @@ -636,6 +597,12 @@ struct bkey_format bch_bkey_format_done(struct bkey_format_state *s) bits += ret.bits_per_field[i]; } + /* allow for extent merging: */ + if (ret.bits_per_field[BKEY_FIELD_SIZE]) { + ret.bits_per_field[BKEY_FIELD_SIZE] += 4; + bits += 4; + } + ret.key_u64s = DIV_ROUND_UP(bits, 64); /* if we have enough spare bits, round fields up to nearest byte */ @@ -1014,25 +981,13 @@ int bch_compile_bkey_format(const struct bkey_format *format, void *_out) /* mov [rdi], eax */ I2(0x89, 0x07); - out = compile_bkey_field(format, out, BKEY_FIELD_INODE, - offsetof(struct bkey, p.inode), 8, - &eax_zeroed); - - out = compile_bkey_field(format, out, BKEY_FIELD_OFFSET, - offsetof(struct bkey, p.offset), 8, - &eax_zeroed); - - out = compile_bkey_field(format, out, BKEY_FIELD_SNAPSHOT, - offsetof(struct bkey, p.snapshot), 4, - &eax_zeroed); - - out = 
compile_bkey_field(format, out, BKEY_FIELD_SIZE, - offsetof(struct bkey, size), 4, - &eax_zeroed); - - out = compile_bkey_field(format, out, BKEY_FIELD_VERSION, - offsetof(struct bkey, version), 4, +#define x(id, field) \ + out = compile_bkey_field(format, out, id, \ + offsetof(struct bkey, field), \ + sizeof(((struct bkey *) NULL)->field), \ &eax_zeroed); + bkey_fields() +#undef x /* retq */ I1(0xc3); @@ -1078,43 +1033,6 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, } #endif -/* - * Would like to use this if we can make __bkey_cmp_bits() fast enough, it'll be - * a decent reduction in code size - */ -#if 0 -static int bkey_cmp_verify(const struct bkey *l, const struct bkey *r) -{ - if (l->p.inode != r->p.inode) - return l->p.inode < r->p.inode ? -1 : 1; - - if (l->p.offset != r->p.offset) - return l->p.offset < r->p.offset ? -1 : 1; - - if (l->p.snapshot != r->p.snapshot) - return l->p.snapshot < r->p.snapshot ? -1 : 1; - - return 0; -} - -int bkey_cmp(const struct bkey *l, const struct bkey *r) -{ - int ret; - - EBUG_ON(bkey_packed(l) || bkey_packed(r)); - - ret = __bkey_cmp_bits((sizeof(l->inode) + - sizeof(l->offset) + - sizeof(l->snapshot)) * BITS_PER_BYTE, - __high_word(BKEY_U64s, l), - __high_word(BKEY_U64s, r)); - - BUG_ON(ret != bkey_cmp_verify(l, r)); - - return ret; -} -#endif - __pure int __bkey_cmp_packed_format_checked(const struct bkey_packed *l, const struct bkey_packed *r, @@ -1214,7 +1132,7 @@ void bkey_pack_test(void) struct bkey_format test_format = { .key_u64s = 2, - .nr_fields = 5, + .nr_fields = BKEY_NR_FIELDS, .bits_per_field = { 13, 64, @@ -1230,21 +1148,9 @@ void bkey_pack_test(void) u64 a, v = get_inc_field(&in_s, i); switch (i) { - case 0: - a = t.p.inode; - break; - case 1: - a = t.p.offset; - break; - case 2: - a = t.p.snapshot; - break; - case 3: - a = t.size; - break; - case 4: - a = t.version; - break; +#define x(id, field) case id: a = t.field; break; + bkey_fields() +#undef x default: BUG(); } diff --git 
a/libbcache/bkey.h b/libbcache/bkey.h index 3e29cdde..0893134f 100644 --- a/libbcache/bkey.h +++ b/libbcache/bkey.h @@ -5,6 +5,7 @@ #include <linux/bcache.h> #include "util.h" +#include "vstructs.h" void bch_to_binary(char *, const u64 *, unsigned); int bch_bkey_to_text(char *, size_t, const struct bkey *); @@ -28,15 +29,7 @@ struct bkey_s { }; }; -#define bkey_next(_k) \ -({ \ - BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ - !type_is(_k, struct bkey_i *) && \ - !type_is(_k, struct bkey_packed *)); \ - \ - ((typeof(_k)) __bkey_idx(((struct bkey *) (_k)), \ - ((struct bkey *) (_k))->u64s)); \ -}) +#define bkey_next(_k) vstruct_next(_k) static inline unsigned bkey_val_u64s(const struct bkey *k) { @@ -218,6 +211,22 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r) void bch_bpos_swab(struct bpos *); void bch_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); +static __always_inline int bversion_cmp(struct bversion l, struct bversion r) +{ + if (l.hi != r.hi) + return l.hi < r.hi ? -1 : 1; + if (l.lo != r.lo) + return l.lo < r.lo ? -1 : 1; + return 0; +} + +#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) + +static __always_inline int bversion_zero(struct bversion v) +{ + return !bversion_cmp(v, ZERO_VERSION); +} + #ifdef CONFIG_BCACHE_DEBUG /* statement expressions confusing unlikely()? 
*/ #define bkey_packed(_k) \ @@ -555,6 +564,7 @@ static inline void __bch_extent_assert(u8 type, u8 nr) } __BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch_extent_assert); +BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION); BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS); BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV); diff --git a/libbcache/blockdev.c b/libbcache/blockdev.c index cd231f5e..d3a373c2 100644 --- a/libbcache/blockdev.c +++ b/libbcache/blockdev.c @@ -2,11 +2,12 @@ #include "bcache.h" #include "blockdev.h" #include "btree_iter.h" +#include "btree_update.h" #include "checksum.h" #include "error.h" #include "inode.h" #include "request.h" -#include "super.h" +#include "super-io.h" #include "writeback.h" #include <linux/kthread.h> @@ -42,15 +43,22 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) down(&dc->sb_write_mutex); closure_init(cl, parent); + sb->csum = csum_vstruct(NULL, BCH_CSUM_CRC64, + (struct nonce) { 0 }, sb).lo; + bio_reset(bio); - bio->bi_end_io = write_bdev_super_endio; - bio->bi_private = dc; + bio->bi_bdev = dc->disk_sb.bdev; + bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); + bio->bi_iter.bi_size = + roundup(vstruct_bytes(sb), + bdev_logical_block_size(dc->disk_sb.bdev)); + bio->bi_end_io = write_bdev_super_endio; + bio->bi_private = dc; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA|REQ_META); + bch_bio_map(bio, sb); closure_get(cl); - sb->csum = cpu_to_le64(__csum_set(sb, 0, BCH_CSUM_CRC64)); - __write_super(dc->disk.c, (void *) &dc->disk_sb); - closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } @@ -263,7 +271,7 @@ static void calc_cached_dev_sectors(struct cache_set *c) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; - char buf[SB_LABEL_SIZE + 1]; + char buf[BCH_SB_LABEL_SIZE + 1]; char *env[] = { "DRIVER=bcache", kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", @@ -272,8 +280,8 @@ void bch_cached_dev_run(struct cached_dev *dc) NULL, }; - memcpy(buf, 
dc->disk_sb.sb->label, SB_LABEL_SIZE); - buf[SB_LABEL_SIZE] = '\0'; + memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); + buf[BCH_SB_LABEL_SIZE] = '\0'; env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); if (atomic_xchg(&dc->running, 1)) { @@ -370,8 +378,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bdevname(dc->disk_sb.bdev, buf); if (memcmp(&dc->disk_sb.sb->set_uuid, - &c->disk_sb.set_uuid, - sizeof(c->disk_sb.set_uuid))) + &c->sb.uuid, + sizeof(c->sb.uuid))) return -ENOENT; if (dc->disk.c) { @@ -424,7 +432,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) SET_CACHED_DEV(&dc->disk.inode.v, true); dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid; memcpy(dc->disk.inode.v.i_label, - dc->disk_sb.sb->label, SB_LABEL_SIZE); + dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); dc->disk.inode.v.i_ctime = rtime; dc->disk.inode.v.i_mtime = rtime; @@ -438,14 +446,15 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) pr_info("attached inode %llu", bcache_dev_inum(&dc->disk)); - dc->disk_sb.sb->set_uuid = c->disk_sb.set_uuid; + dc->disk_sb.sb->set_uuid = c->sb.uuid; SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN); bch_write_bdev_super(dc, &cl); closure_sync(&cl); } else { dc->disk.inode.v.i_mtime = rtime; - bch_inode_update(c, &dc->disk.inode.k_i, NULL); + bch_btree_update(c, BTREE_ID_INODES, + &dc->disk.inode.k_i, NULL); } /* Count dirty sectors before attaching */ @@ -479,7 +488,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) pr_info("Caching %s as %s on set %pU", bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name, - dc->disk.c->disk_sb.set_uuid.b); + dc->disk.c->sb.uuid.b); return 0; } @@ -517,7 +526,7 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); - free_super((void *) &dc->disk_sb); + bch_free_super((void *) &dc->disk_sb); kobject_put(&dc->disk.kobj); } diff --git a/libbcache/bset.c b/libbcache/bset.c index 
34880952..a88d8017 100644 --- a/libbcache/bset.c +++ b/libbcache/bset.c @@ -59,7 +59,7 @@ void bch_dump_bset(struct btree *b, struct bset *i, unsigned set) return; for (_k = i->start, k = bkey_unpack_key(b, _k); - _k < bset_bkey_last(i); + _k < vstruct_last(i); _k = _n, k = n) { _n = bkey_next(_k); @@ -67,7 +67,7 @@ void bch_dump_bset(struct btree *b, struct bset *i, unsigned set) printk(KERN_ERR "block %u key %zi/%u: %s\n", set, _k->_data - i->_data, i->u64s, buf); - if (_n == bset_bkey_last(i)) + if (_n == vstruct_last(i)) continue; n = bkey_unpack_key(b, _n); diff --git a/libbcache/bset.h b/libbcache/bset.h index f03e6b86..70868c51 100644 --- a/libbcache/bset.h +++ b/libbcache/bset.h @@ -9,6 +9,7 @@ #include "bkey_methods.h" #include "btree_types.h" #include "util.h" /* for time_stats */ +#include "vstructs.h" /* * BKEYS: @@ -302,15 +303,6 @@ static inline void btree_node_set_format(struct btree *b, bch_bset_set_no_aux_tree(b, b->set); } -#define __set_bytes(_i, _u64s) (sizeof(*(_i)) + (_u64s) * sizeof(u64)) -#define set_bytes(_i) __set_bytes(_i, (_i)->u64s) - -#define __set_blocks(_i, _u64s, _block_bytes) \ - DIV_ROUND_UP((size_t) __set_bytes((_i), (_u64s)), (_block_bytes)) - -#define set_blocks(_i, _block_bytes) \ - __set_blocks((_i), (_i)->u64s, (_block_bytes)) - static inline struct bset *bset_next_set(struct btree *b, unsigned block_bytes) { @@ -318,7 +310,7 @@ static inline struct bset *bset_next_set(struct btree *b, EBUG_ON(!is_power_of_2(block_bytes)); - return ((void *) i) + round_up(set_bytes(i), block_bytes); + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } void bch_btree_keys_free(struct btree *); @@ -387,11 +379,6 @@ static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b, (cmp == 0 && !strictly_greater && !bkey_deleted(k)); } -static inline struct bkey_packed *bset_bkey_idx(struct bset *i, unsigned idx) -{ - return bkey_idx(i, idx); -} - struct bset_tree *bch_bkey_to_bset(struct btree *, struct bkey_packed *); struct 
bkey_packed *bkey_prev_all(struct btree *, struct bset_tree *, struct bkey_packed *); diff --git a/libbcache/btree_cache.c b/libbcache/btree_cache.c index ca6064af..4d5efdbd 100644 --- a/libbcache/btree_cache.c +++ b/libbcache/btree_cache.c @@ -695,7 +695,7 @@ retry: EBUG_ON(!b->written); EBUG_ON(b->btree_id != iter->btree_id || - BSET_BTREE_LEVEL(&b->data->keys) != level || + BTREE_NODE_LEVEL(b->data) != level || bkey_cmp(b->data->max_key, k->k.p)); return b; diff --git a/libbcache/btree_gc.c b/libbcache/btree_gc.c index 84171875..5c77b267 100644 --- a/libbcache/btree_gc.c +++ b/libbcache/btree_gc.c @@ -18,6 +18,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "super-io.h" #include "writeback.h" #include <linux/slab.h> @@ -118,8 +119,8 @@ u8 bch_btree_key_recalc_oldest_gen(struct cache_set *c, struct bkey_s_c k) /* * For runtime mark and sweep: */ -u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type, - struct bkey_s_c k) +static u8 bch_btree_mark_key(struct cache_set *c, enum bkey_type type, + struct bkey_s_c k) { switch (type) { case BKEY_TYPE_BTREE: @@ -133,10 +134,14 @@ u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type, } } -static u8 btree_mark_key(struct cache_set *c, struct btree *b, - struct bkey_s_c k) +u8 bch_btree_mark_key_initial(struct cache_set *c, enum bkey_type type, + struct bkey_s_c k) { - return __bch_btree_mark_key(c, btree_node_type(b), k); + atomic64_set(&c->key_version, + max_t(u64, k.k->version.lo, + atomic64_read(&c->key_version))); + + return bch_btree_mark_key(c, type, k); } static bool btree_gc_mark_node(struct cache_set *c, struct btree *b) @@ -151,7 +156,8 @@ static bool btree_gc_mark_node(struct cache_set *c, struct btree *b) btree_node_is_extents(b), &unpacked) { bkey_debugcheck(c, b, k); - stale = max(stale, btree_mark_key(c, b, k)); + stale = max(stale, bch_btree_mark_key(c, + btree_node_type(b), k)); } if (btree_gc_rewrite_disabled(c)) @@ -218,7 +224,7 @@ static int 
bch_gc_btree(struct cache_set *c, enum btree_id btree_id) mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - __bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); + bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); @@ -265,22 +271,21 @@ static void bch_mark_allocator_buckets(struct cache_set *c) static void bch_mark_metadata(struct cache_set *c) { struct cache *ca; - unsigned i; + unsigned i, j; + u64 b; for_each_cache(ca, c, i) { - unsigned j; - u64 *i; - - for (j = 0; j < bch_nr_journal_buckets(ca->disk_sb.sb); j++) - bch_mark_metadata_bucket(ca, - &ca->buckets[journal_bucket(ca->disk_sb.sb, j)], - true); + for (j = 0; j < ca->journal.nr; j++) { + b = ca->journal.buckets[j]; + bch_mark_metadata_bucket(ca, ca->buckets + b, true); + } spin_lock(&ca->prio_buckets_lock); - for (i = ca->prio_buckets; - i < ca->prio_buckets + prio_buckets(ca) * 2; i++) - bch_mark_metadata_bucket(ca, &ca->buckets[*i], true); + for (j = 0; j < prio_buckets(ca) * 2; j++) { + b = ca->prio_buckets[j]; + bch_mark_metadata_bucket(ca, ca->buckets + b, true); + } spin_unlock(&ca->prio_buckets_lock); } @@ -476,9 +481,8 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], /* Check if all keys in @old_nodes could fit in one fewer node */ if (nr_old_nodes <= 1 || - __set_blocks(old_nodes[0]->data, - DIV_ROUND_UP(u64s, nr_old_nodes - 1), - block_bytes(c)) > blocks) + __vstruct_blocks(struct btree_node, c->block_bits, + DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) return; res = bch_btree_reserve_get(c, parent, nr_old_nodes, @@ -542,9 +546,9 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], u64s = 0; for (k = s2->start; - k < bset_bkey_last(s2) && - __set_blocks(n1->data, le16_to_cpu(s1->u64s) + u64s + k->u64s, - block_bytes(c)) <= blocks; + k < vstruct_last(s2) && + vstruct_blocks_plus(n1->data, c->block_bits, + u64s + k->u64s) 
<= blocks; k = bkey_next(k)) { last = k; u64s += k->u64s; @@ -554,7 +558,7 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], /* n2 fits entirely in n1 */ n1->key.k.p = n1->data->max_key = n2->data->max_key; - memcpy_u64s(bset_bkey_last(s1), + memcpy_u64s(vstruct_last(s1), s2->start, le16_to_cpu(s2->u64s)); le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); @@ -578,12 +582,12 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], btree_type_successor(iter->btree_id, n1->data->max_key); - memcpy_u64s(bset_bkey_last(s1), + memcpy_u64s(vstruct_last(s1), s2->start, u64s); le16_add_cpu(&s1->u64s, u64s); memmove(s2->start, - bset_bkey_idx(s2, u64s), + vstruct_idx(s2, u64s), (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); @@ -866,7 +870,7 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id) for_each_btree_node_key_unpack(b, k, &node_iter, btree_node_is_extents(b), &unpacked) - btree_mark_key(c, b, k); + bch_btree_mark_key_initial(c, btree_node_type(b), k); } bch_btree_iter_cond_resched(&iter); @@ -874,8 +878,8 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id) bch_btree_iter_unlock(&iter); - __bch_btree_mark_key(c, BKEY_TYPE_BTREE, - bkey_i_to_s_c(&c->btree_roots[id].b->key)); + bch_btree_mark_key(c, BKEY_TYPE_BTREE, + bkey_i_to_s_c(&c->btree_roots[id].b->key)); } int bch_initial_gc(struct cache_set *c, struct list_head *journal) @@ -889,6 +893,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal) bch_journal_mark(c, journal); } + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type) + atomic64_add(1 << 16, &c->key_version); + bch_mark_metadata(c); gc_pos_set(c, gc_phase(GC_PHASE_DONE)); diff --git a/libbcache/btree_gc.h b/libbcache/btree_gc.h index 91d31c05..0607187f 100644 --- a/libbcache/btree_gc.h +++ 
b/libbcache/btree_gc.h @@ -11,7 +11,7 @@ void bch_gc_thread_stop(struct cache_set *); int bch_gc_thread_start(struct cache_set *); int bch_initial_gc(struct cache_set *, struct list_head *); u8 bch_btree_key_recalc_oldest_gen(struct cache_set *, struct bkey_s_c); -u8 __bch_btree_mark_key(struct cache_set *, enum bkey_type, +u8 bch_btree_mark_key_initial(struct cache_set *, enum bkey_type, struct bkey_s_c); /* diff --git a/libbcache/btree_io.c b/libbcache/btree_io.c index 4c295af1..e772c6ad 100644 --- a/libbcache/btree_io.c +++ b/libbcache/btree_io.c @@ -13,6 +13,7 @@ #include "extents.h" #include "io.h" #include "journal.h" +#include "super-io.h" #include <trace/events/bcache.h> @@ -39,7 +40,7 @@ static void clear_needs_whiteout(struct bset *i) { struct bkey_packed *k; - for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) k->needs_whiteout = false; } @@ -47,7 +48,7 @@ static void set_needs_whiteout(struct bset *i) { struct bkey_packed *k; - for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) k->needs_whiteout = true; } @@ -341,7 +342,7 @@ bool __bch_compact_whiteouts(struct cache_set *c, struct btree *b, compacting = true; u_start = u_pos; start = i->start; - end = bset_bkey_last(i); + end = vstruct_last(i); if (src != dst) { memmove(dst, src, sizeof(*src)); @@ -574,7 +575,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b, order = sorting_entire_node ? 
btree_page_order(c) - : get_order(__set_bytes(b->data, u64s)); + : get_order(__vstruct_bytes(struct btree_node, u64s)); out = btree_bounce_alloc(c, order, &used_mempool); @@ -589,8 +590,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b, out->keys.u64s = cpu_to_le16(u64s); - BUG_ON((void *) bset_bkey_last(&out->keys) > - (void *) out + (PAGE_SIZE << order)); + BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); if (sorting_entire_node) bch_time_stats_update(&c->btree_sort_time, start_time); @@ -654,7 +654,7 @@ static struct btree_nr_keys sort_repack(struct bset *dst, bool filter_whiteouts) { struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = bset_bkey_last(dst); + struct bkey_packed *in, *out = vstruct_last(dst); struct btree_nr_keys nr; memset(&nr, 0, sizeof(nr)); @@ -723,7 +723,7 @@ static struct btree_nr_keys sort_repack_merge(struct cache_set *c, btree_keys_account_key_add(&nr, 0, prev); prev = bkey_next(prev); } else { - prev = bset_bkey_last(dst); + prev = vstruct_last(dst); } bkey_copy(prev, &tmp.k); @@ -734,7 +734,7 @@ static struct btree_nr_keys sort_repack_merge(struct cache_set *c, btree_keys_account_key_add(&nr, 0, prev); out = bkey_next(prev); } else { - out = bset_bkey_last(dst); + out = vstruct_last(dst); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); @@ -854,22 +854,23 @@ void bch_btree_init_next(struct cache_set *c, struct btree *b, bch_btree_iter_reinit_node(iter, b); } -/* - * We seed the checksum with the entire first pointer (dev, gen and offset), - * since for btree nodes we have to store the checksum with the data instead of - * the pointer - this helps guard against reading a valid btree node that is not - * the node we actually wanted: - */ -#define btree_csum_set(_b, _i) \ -({ \ - void *_data = (void *) (_i) + 8; \ - void *_end = bset_bkey_last(&(_i)->keys); \ - \ - bch_checksum_update(BSET_CSUM_TYPE(&(_i)->keys), \ - bkey_i_to_extent_c(&(_b)->key)->v._data[0], \ - _data, \ 
- _end - _data) ^ 0xffffffffffffffffULL; \ -}) +static struct nonce btree_nonce(struct btree *b, + struct bset *i, + unsigned offset) +{ + return (struct nonce) {{ + [0] = cpu_to_le32(offset), + [1] = ((__le32 *) &i->seq)[0], + [2] = ((__le32 *) &i->seq)[1], + [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, + }}; +} + +static void bset_encrypt(struct cache_set *c, struct bset *i, struct nonce nonce) +{ + bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); +} #define btree_node_error(b, c, ptr, fmt, ...) \ cache_set_inconsistent(c, \ @@ -877,7 +878,7 @@ void bch_btree_init_next(struct cache_set *c, struct btree *b, (b)->btree_id, (b)->level, btree_node_root(c, b) \ ? btree_node_root(c, b)->level : -1, \ PTR_BUCKET_NR(ca, ptr), (b)->written, \ - (i)->u64s, ##__VA_ARGS__) + le16_to_cpu((i)->u64s), ##__VA_ARGS__) static const char *validate_bset(struct cache_set *c, struct btree *b, struct cache *ca, @@ -886,6 +887,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, unsigned *whiteout_u64s) { struct bkey_packed *k, *prev = NULL; + struct bpos prev_pos = POS_MIN; bool seen_non_whiteout = false; if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) @@ -903,7 +905,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, } for (k = i->start; - k != bset_bkey_last(i);) { + k != vstruct_last(i);) { struct bkey_s_c u; struct bkey tmp; const char *invalid; @@ -911,13 +913,13 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, if (!k->u64s) { btree_node_error(b, c, ptr, "KEY_U64s 0: %zu bytes of metadata lost", - (void *) bset_bkey_last(i) - (void *) k); + vstruct_end(i) - (void *) k); i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } - if (bkey_next(k) > bset_bkey_last(i)) { + if (bkey_next(k) > vstruct_last(i)) { btree_node_error(b, c, ptr, "key extends past end of bset"); @@ -931,7 +933,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, 
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), - (u64 *) bset_bkey_last(i) - (u64 *) k); + (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -951,7 +953,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), - (u64 *) bset_bkey_last(i) - (u64 *) k); + (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -963,22 +965,40 @@ static const char *validate_bset(struct cache_set *c, struct btree *b, if (!seen_non_whiteout && (!bkey_whiteout(k) || - (prev && bkey_cmp_left_packed_byval(b, prev, - bkey_start_pos(u.k)) > 0))) { + (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; + } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { + btree_node_error(b, c, ptr, + "keys out of order: %llu:%llu > %llu:%llu", + prev_pos.inode, + prev_pos.offset, + u.k->p.inode, + bkey_start_offset(u.k)); + /* XXX: repair this */ } + prev_pos = u.k->p; prev = k; k = bkey_next(k); } SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - - b->written += sectors; return NULL; } +static bool extent_contains_ptr(struct bkey_s_c_extent e, + struct bch_extent_ptr match) +{ + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr(e, ptr) + if (!memcmp(ptr, &match, sizeof(*ptr))) + return true; + + return false; +} + void bch_btree_node_read_done(struct cache_set *c, struct btree *b, struct cache *ca, const struct bch_extent_ptr *ptr) @@ -990,6 +1010,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b, bool used_mempool; unsigned u64s; const char *err; + struct bch_csum csum; + struct nonce nonce; int ret; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); @@ -1005,40 +1027,62 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b, if (!b->written) { i = &b->data->keys; + err = "bad magic"; + if (le64_to_cpu(b->data->magic) != bset_magic(c)) + goto err; + + err = 
"bad btree header"; + if (!b->data->keys.seq) + goto err; + err = "unknown checksum type"; - if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR) + if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i))) goto err; /* XXX: retry checksum errors */ + nonce = btree_nonce(b, i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + err = "bad checksum"; - if (le64_to_cpu(b->data->csum) != - btree_csum_set(b, b->data)) + if (bch_crc_cmp(csum, b->data->csum)) goto err; - sectors = __set_blocks(b->data, - le16_to_cpu(b->data->keys.u64s), - block_bytes(c)) << c->block_bits; + bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &b->data->flags, + (void *) &b->data->keys - + (void *) &b->data->flags); + nonce = nonce_add(nonce, + round_up((void *) &b->data->keys - + (void *) &b->data->flags, + CHACHA20_BLOCK_SIZE)); + bset_encrypt(c, i, nonce); - err = "bad magic"; - if (le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb)) - goto err; - - err = "bad btree header"; - if (!b->data->keys.seq) - goto err; + sectors = vstruct_sectors(b->data, c->block_bits); if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { + u64 *p = (u64 *) &b->data->ptr; + + *p = swab64(*p); bch_bpos_swab(&b->data->min_key); bch_bpos_swab(&b->data->max_key); } + err = "incorrect btree id"; + if (BTREE_NODE_ID(b->data) != b->btree_id) + goto err; + + err = "incorrect level"; + if (BTREE_NODE_LEVEL(b->data) != b->level) + goto err; + err = "incorrect max key"; if (bkey_cmp(b->data->max_key, b->key.k.p)) goto err; - err = "incorrect level"; - if (BSET_BTREE_LEVEL(i) != b->level) + err = "incorrect backpointer"; + if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), + b->data->ptr)) goto err; err = bch_bkey_format_validate(&b->data->format); @@ -1056,23 +1100,27 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b, break; err = "unknown checksum type"; - if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR) + if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i))) goto err; + nonce = btree_nonce(b, i, b->written << 9); 
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + err = "bad checksum"; - if (le64_to_cpu(bne->csum) != - btree_csum_set(b, bne)) + if (memcmp(&csum, &bne->csum, sizeof(csum))) goto err; - sectors = __set_blocks(bne, - le16_to_cpu(bne->keys.u64s), - block_bytes(c)) << c->block_bits; + bset_encrypt(c, i, nonce); + + sectors = vstruct_sectors(bne, c->block_bits); } err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s); if (err) goto err; + b->written += sectors; + err = "insufficient memory"; ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); if (ret < 0) @@ -1083,11 +1131,11 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b, __bch_btree_node_iter_push(iter, b, i->start, - bkey_idx(i, whiteout_u64s)); + vstruct_idx(i, whiteout_u64s)); __bch_btree_node_iter_push(iter, b, - bkey_idx(i, whiteout_u64s), - bset_bkey_last(i)); + vstruct_idx(i, whiteout_u64s), + vstruct_last(i)); } err = "corrupted btree"; @@ -1290,6 +1338,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b, struct bch_extent_ptr *ptr; struct cache *ca; struct sort_iter sort_iter; + struct nonce nonce; unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; u64 seq = 0; bool used_mempool; @@ -1330,7 +1379,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b, BUG_ON(b->written >= c->sb.btree_node_size); BUG_ON(bset_written(b, btree_bset_last(b))); - BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb)); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); if (lock_type_held == SIX_LOCK_intent) { @@ -1396,7 +1445,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b, b->whiteout_u64s = 0; u64s = btree_node_is_extents(b) - ? sort_extents(bset_bkey_last(i), &sort_iter, false) + ? 
sort_extents(vstruct_last(i), &sort_iter, false) : sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); @@ -1413,14 +1462,30 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b, BUG_ON(i->seq != b->data->keys.seq); i->version = cpu_to_le16(BCACHE_BSET_VERSION); - SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum); + SET_BSET_CSUM_TYPE(i, bch_meta_checksum_type(c)); + + nonce = btree_nonce(b, i, b->written << 9); + + if (bn) { + bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &bn->flags, + (void *) &b->data->keys - + (void *) &b->data->flags); + nonce = nonce_add(nonce, + round_up((void *) &b->data->keys - + (void *) &b->data->flags, + CHACHA20_BLOCK_SIZE)); + bset_encrypt(c, i, nonce); + + nonce = btree_nonce(b, i, b->written << 9); + bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); + } else { + bset_encrypt(c, i, nonce); - if (bn) - bn->csum = cpu_to_le64(btree_csum_set(b, bn)); - else - bne->csum = cpu_to_le64(btree_csum_set(b, bne)); + bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + } - bytes_to_write = (void *) bset_bkey_last(i) - data; + bytes_to_write = vstruct_end(i) - data; sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; memset(data + bytes_to_write, 0, @@ -1548,7 +1613,7 @@ bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b) * If later we don't unconditionally sort down to a single bset, we have * to ensure this is still true: */ - BUG_ON((void *) bset_bkey_last(btree_bset_last(b)) > write_block(b)); + BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); bne = want_new_bset(c, b); if (bne) diff --git a/libbcache/btree_types.h b/libbcache/btree_types.h index 176d42a7..4cbec7fe 100644 --- a/libbcache/btree_types.h +++ b/libbcache/btree_types.h @@ -202,24 +202,12 @@ __btree_node_offset_to_key(const struct btree *b, u16 k) return (void *) ((u64 *) b->data + k + 1); } -#define __bkey_idx(_set, _offset) \ - ((_set)->_data + (_offset)) - -#define 
bkey_idx(_set, _offset) \ - ((typeof(&(_set)->start[0])) __bkey_idx((_set), (_offset))) - -#define __bset_bkey_last(_set) \ - __bkey_idx((_set), (_set)->u64s) - -#define bset_bkey_last(_set) \ - bkey_idx((_set), le16_to_cpu((_set)->u64s)) - #define btree_bkey_first(_b, _t) (bset(_b, _t)->start) #define btree_bkey_last(_b, _t) \ ({ \ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ - bset_bkey_last(bset(_b, _t))); \ + vstruct_last(bset(_b, _t))); \ \ __btree_node_offset_to_key(_b, (_t)->end_offset); \ }) @@ -227,7 +215,7 @@ __btree_node_offset_to_key(const struct btree *b, u16 k) static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) { t->end_offset = - __btree_node_key_to_offset(b, bset_bkey_last(bset(b, t))); + __btree_node_key_to_offset(b, vstruct_last(bset(b, t))); btree_bkey_last(b, t); } diff --git a/libbcache/btree_update.c b/libbcache/btree_update.c index 95406a44..c3bb2092 100644 --- a/libbcache/btree_update.c +++ b/libbcache/btree_update.c @@ -12,7 +12,7 @@ #include "extents.h" #include "journal.h" #include "keylist.h" -#include "super.h" +#include "super-io.h" #include <linux/random.h> #include <linux/sort.h> @@ -80,7 +80,7 @@ bool bch_btree_node_format_fits(struct cache_set *c, struct btree *b, { size_t u64s = btree_node_u64s_with_format(b, new_f); - return __set_bytes(b->data, u64s) < btree_bytes(c); + return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); } /* Btree node freeing/allocation: */ @@ -298,8 +298,11 @@ static struct btree *bch_btree_node_alloc(struct cache_set *c, bch_bset_init_first(b, &b->data->keys); memset(&b->nr, 0, sizeof(b->nr)); - b->data->magic = cpu_to_le64(bset_magic(&c->disk_sb)); - SET_BSET_BTREE_LEVEL(&b->data->keys, level); + b->data->magic = cpu_to_le64(bset_magic(c)); + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, id); + SET_BTREE_NODE_LEVEL(b->data, level); + b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr; bch_btree_build_aux_trees(b); @@ -1292,7 +1295,7 @@ static 
struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n */ k = set1->start; while (1) { - if (bkey_next(k) == bset_bkey_last(set1)) + if (bkey_next(k) == vstruct_last(set1)) break; if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) break; @@ -1313,7 +1316,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n n2->data->min_key = btree_type_successor(n1->btree_id, n1->key.k.p); - set2->u64s = cpu_to_le16((u64 *) bset_bkey_last(set1) - (u64 *) k); + set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); set_btree_bset_end(n1, n1->set); @@ -1333,7 +1336,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n BUG_ON(!set2->u64s); memcpy_u64s(set2->start, - bset_bkey_last(set1), + vstruct_end(set1), le16_to_cpu(set2->u64s)); btree_node_reset_sib_u64s(n1); @@ -1393,12 +1396,12 @@ static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b, */ i = btree_bset_first(b); p = i->start; - while (p != bset_bkey_last(i)) + while (p != vstruct_last(i)) if (bkey_deleted(p)) { le16_add_cpu(&i->u64s, -p->u64s); set_btree_bset_end(b, b->set); memmove_u64s_down(p, bkey_next(p), - (u64 *) bset_bkey_last(i) - + (u64 *) vstruct_last(i) - (u64 *) p); } else p = bkey_next(p); @@ -1428,9 +1431,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, if (b->level) btree_split_insert_keys(iter, n1, insert_keys, reserve); - if (__set_blocks(n1->data, - le16_to_cpu(n1->data->keys.u64s), - block_bytes(c)) > BTREE_SPLIT_THRESHOLD(c)) { + if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { trace_bcache_btree_node_split(c, b, b->nr.live_u64s); n2 = __btree_split_node(iter, n1, reserve); @@ -1939,7 +1940,7 @@ retry: u64s = 0; trans_for_each_entry(trans, i) if (!i->done) - u64s += jset_u64s(i->k->k.u64s); + u64s += jset_u64s(i->k->k.u64s + i->extra_res); 
memset(&trans->journal_res, 0, sizeof(trans->journal_res)); @@ -1966,7 +1967,7 @@ retry: * written one */ if (!i->done) { - u64s += i->k->k.u64s; + u64s += i->k->k.u64s + i->extra_res; if (!bch_btree_node_insert_fits(c, i->iter->nodes[0], u64s)) { split = i->iter; @@ -2217,7 +2218,7 @@ int bch_btree_update(struct cache_set *c, enum btree_id id, int bch_btree_delete_range(struct cache_set *c, enum btree_id id, struct bpos start, struct bpos end, - u64 version, + struct bversion version, struct disk_reservation *disk_res, struct extent_insert_hook *hook, u64 *journal_seq) diff --git a/libbcache/btree_update.h b/libbcache/btree_update.h index 5fc1b1aa..8ff089da 100644 --- a/libbcache/btree_update.h +++ b/libbcache/btree_update.h @@ -5,6 +5,7 @@ #include "btree_iter.h" #include "buckets.h" #include "journal.h" +#include "vstructs.h" struct cache_set; struct bkey_format_state; @@ -200,7 +201,7 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i) static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b, struct bset *i) { - return round_up(bset_byte_offset(b, bset_bkey_last(i)), + return round_up(bset_byte_offset(b, vstruct_end(i)), block_bytes(c)) >> 9; } @@ -208,7 +209,7 @@ static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c, struct btree *b) { struct bset *i = btree_bset_last(b); - unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) + + unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) + b->whiteout_u64s + b->uncompacted_whiteout_u64s; unsigned total = c->sb.btree_node_size << 6; @@ -235,7 +236,7 @@ static inline struct btree_node_entry *want_new_bset(struct cache_set *c, { struct bset *i = btree_bset_last(b); unsigned offset = max_t(unsigned, b->written << 9, - bset_byte_offset(b, bset_bkey_last(i))); + bset_byte_offset(b, vstruct_end(i))); ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t) (offset + sizeof(struct btree_node_entry) + b->whiteout_u64s * sizeof(u64) + @@ -244,8 +245,8 
@@ static inline struct btree_node_entry *want_new_bset(struct cache_set *c, EBUG_ON(offset > btree_bytes(c)); if ((unlikely(bset_written(b, i)) && n > 0) || - (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) > - btree_write_set_buffer(b)) && n > btree_write_set_buffer(b))) + (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && + n > btree_write_set_buffer(b))) return (void *) b->data + offset; return NULL; @@ -308,6 +309,7 @@ struct btree_insert { struct btree_insert_entry { struct btree_iter *iter; struct bkey_i *k; + unsigned extra_res; /* * true if entire key was inserted - can only be false for * extents @@ -329,6 +331,14 @@ int __bch_btree_insert_at(struct btree_insert *); .done = false, \ }) +#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \ + ((struct btree_insert_entry) { \ + .iter = (_iter), \ + .k = (_k), \ + .extra_res = (_extra), \ + .done = false, \ + }) + /** * bch_btree_insert_at - insert one or more keys at iterator positions * @iter: btree iterator @@ -391,7 +401,7 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans, return true; for (i = insert; i < trans->entries + trans->nr; i++) - u64s += jset_u64s(i->k->k.u64s); + u64s += jset_u64s(i->k->k.u64s + i->extra_res); return u64s <= trans->journal_res.u64s; } @@ -404,7 +414,7 @@ int bch_btree_update(struct cache_set *, enum btree_id, struct bkey_i *, u64 *); int bch_btree_delete_range(struct cache_set *, enum btree_id, - struct bpos, struct bpos, u64, + struct bpos, struct bpos, struct bversion, struct disk_reservation *, struct extent_insert_hook *, u64 *); diff --git a/libbcache/buckets.c b/libbcache/buckets.c index 3398b255..757bc035 100644 --- a/libbcache/buckets.c +++ b/libbcache/buckets.c @@ -534,12 +534,10 @@ static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e, rcu_read_lock(); extent_for_each_online_device_crc(c, e, crc, ptr, ca) { - bool dirty = bch_extent_ptr_is_dirty(c, e, ptr); - - trace_bcache_mark_bucket(ca, e.k, ptr, sectors, 
dirty); + trace_bcache_mark_bucket(ca, e.k, ptr, sectors, !ptr->cached); bch_mark_pointer(c, e, ca, crc, ptr, sectors, - dirty ? type : S_CACHED, + ptr->cached ? S_CACHED : type, may_make_unavailable, stats, gc_will_visit, journal_seq); } @@ -559,10 +557,13 @@ static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k, may_make_unavailable, stats, gc_will_visit, journal_seq); break; - case BCH_RESERVATION: - stats->persistent_reserved += sectors; + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + stats->persistent_reserved += r.v->nr_replicas * sectors; break; } + } } void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k, diff --git a/libbcache/buckets.h b/libbcache/buckets.h index 35100eba..8194dd9b 100644 --- a/libbcache/buckets.h +++ b/libbcache/buckets.h @@ -42,7 +42,7 @@ static inline u8 bucket_gc_gen(struct cache *ca, struct bucket *g) static inline struct cache *PTR_CACHE(const struct cache_set *c, const struct bch_extent_ptr *ptr) { - EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_in_set); + EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_devices); return rcu_dereference(c->cache[ptr->dev]); } diff --git a/libbcache/chardev.c b/libbcache/chardev.c index 0b020c84..b361b092 100644 --- a/libbcache/chardev.c +++ b/libbcache/chardev.c @@ -9,6 +9,7 @@ #include "bcache.h" #include "super.h" +#include "super-io.h" #include <linux/module.h> #include <linux/fs.h> @@ -202,16 +203,16 @@ static long bch_ioctl_disk_fail(struct cache_set *c, return ret; } -static struct cache_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid) +static struct bch_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid) { - struct cache_member *mi = c->disk_mi; + struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb); unsigned i; - lockdep_assert_held(&bch_register_lock); + lockdep_assert_held(&c->sb_lock); - for (i = 0; i < c->disk_sb.nr_in_set; i++) - if (!memcmp(&mi[i].uuid, &uuid, sizeof(uuid))) - return 
&mi[i]; + for (i = 0; i < c->disk_sb->nr_devices; i++) + if (!memcmp(&mi->members[i].uuid, &uuid, sizeof(uuid))) + return &mi->members[i]; return NULL; } @@ -220,20 +221,20 @@ static long bch_ioctl_disk_remove_by_uuid(struct cache_set *c, struct bch_ioctl_disk_remove_by_uuid __user *user_arg) { struct bch_ioctl_disk_fail_by_uuid arg; - struct cache_member *m; + struct bch_member *m; int ret = -ENOENT; if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; - mutex_lock(&bch_register_lock); + mutex_lock(&c->sb_lock); if ((m = bch_uuid_lookup(c, arg.dev))) { /* XXX: */ - SET_CACHE_STATE(m, CACHE_FAILED); - bcache_write_super(c); + SET_BCH_MEMBER_STATE(m, BCH_MEMBER_STATE_FAILED); + bch_write_super(c); ret = 0; } - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->sb_lock); return ret; } @@ -242,19 +243,19 @@ static long bch_ioctl_disk_fail_by_uuid(struct cache_set *c, struct bch_ioctl_disk_fail_by_uuid __user *user_arg) { struct bch_ioctl_disk_fail_by_uuid arg; - struct cache_member *m; + struct bch_member *m; int ret = -ENOENT; if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; - mutex_lock(&bch_register_lock); + mutex_lock(&c->sb_lock); if ((m = bch_uuid_lookup(c, arg.dev))) { - SET_CACHE_STATE(m, CACHE_FAILED); - bcache_write_super(c); + SET_BCH_MEMBER_STATE(m, BCH_MEMBER_STATE_FAILED); + bch_write_super(c); ret = 0; } - mutex_unlock(&bch_register_lock); + mutex_unlock(&c->sb_lock); return ret; } @@ -263,8 +264,8 @@ static long bch_ioctl_query_uuid(struct cache_set *c, struct bch_ioctl_query_uuid __user *user_arg) { return copy_to_user(&user_arg->uuid, - &c->disk_sb.user_uuid, - sizeof(c->disk_sb.user_uuid)); + &c->sb.user_uuid, + sizeof(c->sb.user_uuid)); } long bch_cache_set_ioctl(struct cache_set *c, unsigned cmd, void __user *arg) diff --git a/libbcache/checksum.c b/libbcache/checksum.c index beae0b26..eb41f2ea 100644 --- a/libbcache/checksum.c +++ b/libbcache/checksum.c @@ -1,11 +1,19 @@ #include "bcache.h" #include "checksum.h" 
+#include "super.h" +#include "super-io.h" #include <linux/crc32c.h> +#include <linux/crypto.h> +#include <linux/key.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <crypto/algapi.h> #include <crypto/chacha20.h> #include <crypto/hash.h> #include <crypto/poly1305.h> +#include <keys/user-type.h> /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any @@ -129,7 +137,35 @@ u64 bch_crc64_update(u64 crc, const void *_data, size_t len) return crc; } -u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len) +static u64 bch_checksum_init(unsigned type) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case BCH_CSUM_CRC32C: + return U32_MAX; + case BCH_CSUM_CRC64: + return U64_MAX; + default: + BUG(); + } +} + +static u64 bch_checksum_final(unsigned type, u64 crc) +{ + switch (type) { + case BCH_CSUM_NONE: + return 0; + case BCH_CSUM_CRC32C: + return crc ^ U32_MAX; + case BCH_CSUM_CRC64: + return crc ^ U64_MAX; + default: + BUG(); + } +} + +static u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len) { switch (type) { case BCH_CSUM_NONE: @@ -143,32 +179,416 @@ u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len) } } -u64 bch_checksum(unsigned type, const void *data, size_t len) +static inline void do_encrypt_sg(struct crypto_blkcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) +{ + struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d }; + int ret; + + ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len); + BUG_ON(ret); +} + +static inline void do_encrypt(struct crypto_blkcipher *tfm, + struct nonce nonce, + void *buf, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, buf, len); + do_encrypt_sg(tfm, nonce, &sg, len); +} + +int bch_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, + void *buf, size_t len) +{ + struct crypto_blkcipher *chacha20 = + crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC); 
+ int ret; + + if (!chacha20) + return PTR_ERR(chacha20); + + ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key)); + if (ret) + goto err; + + do_encrypt(chacha20, nonce, buf, len); +err: + crypto_free_blkcipher(chacha20); + return ret; +} + +static void gen_poly_key(struct cache_set *c, struct shash_desc *desc, + struct nonce nonce) +{ + u8 key[POLY1305_KEY_SIZE]; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); + do_encrypt(c->chacha20, nonce, key, sizeof(key)); + + desc->tfm = c->poly1305; + desc->flags = 0; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); +} + +struct bch_csum bch_checksum(struct cache_set *c, unsigned type, + struct nonce nonce, const void *data, size_t len) { - u64 crc = 0xffffffffffffffffULL; + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { + u64 crc = bch_checksum_init(type); + + crc = bch_checksum_update(type, crc, data, len); + crc = bch_checksum_final(type, crc); + + return (struct bch_csum) { .lo = crc }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + + crypto_shash_update(desc, data, len); + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} - crc = bch_checksum_update(type, crc, data, len); +void bch_encrypt(struct cache_set *c, unsigned type, + struct nonce nonce, void *data, size_t len) +{ + if (!bch_csum_type_is_encryption(type)) + return; - return crc ^ 0xffffffffffffffffULL; + do_encrypt(c->chacha20, nonce, data, len); } -u32 bch_checksum_bio(struct bio *bio, unsigned type) +struct bch_csum bch_checksum_bio(struct cache_set *c, unsigned type, + struct nonce nonce, struct bio *bio) { struct bio_vec bv; struct bvec_iter iter; - u32 csum = U32_MAX; - if (type == BCH_CSUM_NONE) - 
return 0; + switch (type) { + case BCH_CSUM_NONE: + return (struct bch_csum) { 0 }; + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: { + u64 crc = bch_checksum_init(type); + + bio_for_each_segment(bv, bio, iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + + crc = bch_checksum_update(type, + crc, p, bv.bv_len); + kunmap_atomic(p); + } + + crc = bch_checksum_final(type, crc); + return (struct bch_csum) { .lo = crc }; + } + + case BCH_CSUM_CHACHA20_POLY1305_80: + case BCH_CSUM_CHACHA20_POLY1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + + bio_for_each_segment(bv, bio, iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + + crypto_shash_update(desc, p, bv.bv_len); + kunmap_atomic(p); + } + + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +void bch_encrypt_bio(struct cache_set *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; + + if (!bch_csum_type_is_encryption(type)) + return; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); bio_for_each_segment(bv, bio, iter) { - void *p = kmap_atomic(bv.bv_page); + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); + do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + + le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE); + bytes = 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + sg = sgl; + } + + sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); + bytes += bv.bv_len; + + } + + sg_mark_end(sg - 1); + do_encrypt_sg(c->chacha20, nonce, sgl, bytes); +} + +#ifdef __KERNEL__ +int bch_request_key(struct bch_sb *sb, struct bch_key *key) +{ + char key_description[60]; + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + + snprintf(key_description, sizeof(key_description), + "bcache:%pUb", 
&sb->user_uuid); + + keyring_key = request_key(&key_type_logon, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen == sizeof(*key)) { + memcpy(key, ukp->data, ukp->datalen); + ret = 0; + } else { + ret = -EINVAL; + } + up_read(&keyring_key->sem); + key_put(keyring_key); + + return ret; +} +#else +#include <keyutils.h> +#include <uuid/uuid.h> + +int bch_request_key(struct bch_sb *sb, struct bch_key *key) +{ + key_serial_t key_id; + char key_description[60]; + char uuid[40]; + + uuid_unparse_lower(sb->user_uuid.b, uuid); + sprintf(key_description, "bcache:%s", uuid); + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); + if (key_id < 0) + return -errno; + + if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) + return -1; + + return 0; +} +#endif - csum = bch_checksum_update(type, csum, - p + bv.bv_offset, - bv.bv_len); - kunmap_atomic(p); +static int bch_decrypt_sb_key(struct cache_set *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) +{ + struct bch_encrypted_key sb_key = crypt->key; + struct bch_key user_key; + int ret = 0; + + /* is key encrypted? 
*/ + if (!bch_key_is_encrypted(&sb_key)) + goto out; + + ret = bch_request_key(c->disk_sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key"); + goto err; } - return csum ^= U32_MAX; + /* decrypt real key: */ + ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c), + &sb_key, sizeof(sb_key)); + if (ret) + goto err; + + if (bch_key_is_encrypted(&sb_key)) { + bch_err(c, "incorrect encryption key"); + ret = -EINVAL; + goto err; + } +out: + *key = sb_key.key; +err: + memzero_explicit(&sb_key, sizeof(sb_key)); + memzero_explicit(&user_key, sizeof(user_key)); + return ret; +} + +static int bch_alloc_ciphers(struct cache_set *c) +{ + if (!c->chacha20) + c->chacha20 = crypto_alloc_blkcipher("chacha20", 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(c->chacha20)) + return PTR_ERR(c->chacha20); + + if (!c->poly1305) + c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); + if (IS_ERR(c->poly1305)) + return PTR_ERR(c->poly1305); + + return 0; +} + +int bch_disable_encryption(struct cache_set *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + crypt = bch_sb_get_crypt(c->disk_sb); + if (!crypt) + goto out; + + /* is key encrypted? */ + ret = 0; + if (bch_key_is_encrypted(&crypt->key)) + goto out; + + ret = bch_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + crypt->key.magic = BCH_KEY_MAGIC; + crypt->key.key = key; + + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0); + bch_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch_enable_encryption(struct cache_set *c, bool keyed) +{ + struct bch_encrypted_key key; + struct bch_key user_key; + struct bch_sb_field_crypt *crypt; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + /* Do we already have an encryption key? 
*/ + if (bch_sb_get_crypt(c->disk_sb)) + goto err; + + ret = bch_alloc_ciphers(c); + if (ret) + goto err; + + key.magic = BCH_KEY_MAGIC; + get_random_bytes(&key.key, sizeof(key.key)); + + if (keyed) { + ret = bch_request_key(c->disk_sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key"); + goto err; + } + + ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c), + &key, sizeof(key)); + if (ret) + goto err; + } + + ret = crypto_blkcipher_setkey(c->chacha20, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto err; + + crypt = container_of_or_null(bch_fs_sb_field_resize(c, NULL, + sizeof(*crypt) / sizeof(u64)), + struct bch_sb_field_crypt, field); + if (!crypt) { + ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ + goto err; + } + + crypt->field.type = BCH_SB_FIELD_crypt; + crypt->key = key; + + /* write superblock */ + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1); + bch_write_super(c); +err: + mutex_unlock(&c->sb_lock); + memzero_explicit(&user_key, sizeof(user_key)); + memzero_explicit(&key, sizeof(key)); + return ret; +} + +void bch_cache_set_encryption_free(struct cache_set *c) +{ + if (!IS_ERR_OR_NULL(c->poly1305)) + crypto_free_shash(c->poly1305); + if (!IS_ERR_OR_NULL(c->chacha20)) + crypto_free_blkcipher(c->chacha20); +} + +int bch_cache_set_encryption_init(struct cache_set *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret; + + crypt = bch_sb_get_crypt(c->disk_sb); + if (!crypt) + return 0; + + ret = bch_alloc_ciphers(c); + if (ret) + return ret; + + ret = bch_decrypt_sb_key(c, crypt, &key); + if (ret) + goto err; + + ret = crypto_blkcipher_setkey(c->chacha20, + (void *) &key.key, sizeof(key.key)); +err: + memzero_explicit(&key, sizeof(key)); + return ret; } diff --git a/libbcache/checksum.h b/libbcache/checksum.h index 196b7e8c..a9a17587 100644 --- a/libbcache/checksum.h +++ b/libbcache/checksum.h @@ -1,24 +1,133 @@ #ifndef _BCACHE_CHECKSUM_H #define _BCACHE_CHECKSUM_H -#include "btree_types.h" 
+#include "bcache.h" +#include "super-io.h" + +#include <crypto/chacha20.h> u64 bch_crc64_update(u64, const void *, size_t); -u64 bch_checksum_update(unsigned, u64, const void *, size_t); -u64 bch_checksum(unsigned, const void *, size_t); -u32 bch_checksum_bio(struct bio *, unsigned); +#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) +#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) +#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) +#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) +#define BCH_NONCE_POLY cpu_to_le32(1 << 31) + +struct bch_csum bch_checksum(struct cache_set *, unsigned, struct nonce, + const void *, size_t); /* - * This is used for various on disk data structures - cache_sb, prio_set, bset, - * jset: The checksum is _always_ the first 8 bytes of these structs + * This is used for various on disk data structures - bch_sb, prio_set, bset, + * jset: The checksum is _always_ the first field of these structs */ -#define __csum_set(i, u64s, type) \ +#define csum_vstruct(_c, _type, _nonce, _i) \ ({ \ - const void *start = ((const void *) (i)) + sizeof(u64); \ - const void *end = __bkey_idx(i, u64s); \ + const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ + const void *end = vstruct_end(_i); \ \ - bch_checksum(type, start, end - start); \ + bch_checksum(_c, _type, _nonce, start, end - start); \ }) +int bch_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +int bch_request_key(struct bch_sb *, struct bch_key *); + +void bch_encrypt(struct cache_set *, unsigned, struct nonce, + void *data, size_t); + +struct bch_csum bch_checksum_bio(struct cache_set *, unsigned, + struct nonce, struct bio *); +void bch_encrypt_bio(struct cache_set *, unsigned, + struct nonce, struct bio *); + +int bch_disable_encryption(struct cache_set *); +int bch_enable_encryption(struct cache_set *, bool); + +void bch_cache_set_encryption_free(struct cache_set *); +int bch_cache_set_encryption_init(struct cache_set *); + +static inline unsigned 
bch_data_checksum_type(struct cache_set *c) +{ + if (c->sb.encryption_type) + return c->opts.wide_macs + ? BCH_CSUM_CHACHA20_POLY1305_128 + : BCH_CSUM_CHACHA20_POLY1305_80; + + return c->opts.data_checksum; +} + +static inline unsigned bch_meta_checksum_type(struct cache_set *c) +{ + return c->sb.encryption_type + ? BCH_CSUM_CHACHA20_POLY1305_128 + : c->opts.metadata_checksum; +} + +static inline bool bch_checksum_type_valid(const struct cache_set *c, + unsigned type) +{ + if (type >= BCH_CSUM_NR) + return false; + + if (bch_csum_type_is_encryption(type) && !c->chacha20) + return false; + + return true; +} + +static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_NONE] = 0, + [BCH_CSUM_CRC32C] = 4, + [BCH_CSUM_CRC64] = 8, + [BCH_CSUM_CHACHA20_POLY1305_80] = 10, + [BCH_CSUM_CHACHA20_POLY1305_128] = 16, +}; + +static inline bool bch_crc_cmp(struct bch_csum l, struct bch_csum r) +{ + /* + * XXX: need some way of preventing the compiler from optimizing this + * into a form that isn't constant time.. 
+ */ + return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; +} + +/* for skipping ahead and encrypting/decrypting at an offset: */ +static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) +{ + EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1)); + + le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE); + return nonce; +} + +static inline bool bch_key_is_encrypted(struct bch_encrypted_key *key) +{ + return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; +} + +static inline struct nonce __bch_sb_key_nonce(struct bch_sb *sb) +{ + __le64 magic = __bch_sb_magic(sb); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +static inline struct nonce bch_sb_key_nonce(struct cache_set *c) +{ + __le64 magic = bch_sb_magic(c); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + #endif /* _BCACHE_CHECKSUM_H */ diff --git a/libbcache/compress.c b/libbcache/compress.c index f7bfd57f..e76850be 100644 --- a/libbcache/compress.c +++ b/libbcache/compress.c @@ -1,6 +1,8 @@ #include "bcache.h" #include "compress.h" +#include "extents.h" #include "io.h" +#include "super-io.h" #include <linux/lz4.h> #include <linux/zlib.h> @@ -50,7 +52,7 @@ static void *__bio_map_or_bounce(struct cache_set *c, unsigned prev_end = PAGE_SIZE; void *data; - BUG_ON(bvec_iter_sectors(start) > BCH_COMPRESSED_EXTENT_MAX); + BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX); *bounced = BOUNCED_MAPPED; @@ -118,12 +120,12 @@ static void bio_unmap_or_unbounce(struct cache_set *c, void *data, } static int __bio_uncompress(struct cache_set *c, struct bio *src, - void *dst_data, struct bch_extent_crc64 crc) + void *dst_data, struct bch_extent_crc128 crc) { void *src_data = NULL; unsigned src_bounced; size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc.uncompressed_size << 9; + size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; int ret; src_data = 
bio_map_or_bounce(c, src, &src_bounced, READ); @@ -179,10 +181,10 @@ err: int bch_bio_uncompress_inplace(struct cache_set *c, struct bio *bio, unsigned live_data_sectors, - struct bch_extent_crc64 crc) + struct bch_extent_crc128 crc) { void *dst_data = NULL; - size_t dst_len = crc.uncompressed_size << 9; + size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; int ret = -ENOMEM; BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs); @@ -231,11 +233,11 @@ use_mempool: int bch_bio_uncompress(struct cache_set *c, struct bio *src, struct bio *dst, struct bvec_iter dst_iter, - struct bch_extent_crc64 crc) + struct bch_extent_crc128 crc) { void *dst_data = NULL; unsigned dst_bounced; - size_t dst_len = crc.uncompressed_size << 9; + size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; int ret = -ENOMEM; dst_data = dst_len == dst_iter.bi_size @@ -273,28 +275,23 @@ static int __bio_compress(struct cache_set *c, *src_len = src->bi_iter.bi_size; workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO); -retry_compress: - ret = lz4_compress(src_data, *src_len, - dst_data, dst_len, - workspace); - /* - * On error, the compressed data was bigger than dst_len, and - * -ret is the amount of data we were able to compress - round - * down to nearest block and try again: - */ - if (ret && round_down(-ret, block_bytes(c)) > *dst_len) { - BUG_ON(ret > 0); - /* not supposed to happen */ - if (WARN_ON(-ret >= *src_len)) - goto err; + while (*src_len > block_bytes(c) && + (ret = lz4_compress(src_data, *src_len, + dst_data, dst_len, + workspace))) { + /* + * On error, the compressed data was bigger than + * dst_len, and -ret is the amount of data we were able + * to compress - round down to nearest block and try + * again: + */ + BUG_ON(ret > 0); + BUG_ON(-ret >= *src_len); *src_len = round_down(-ret, block_bytes(c)); - if (!*src_len) - goto err; - - goto retry_compress; } + mempool_free(workspace, &c->lz4_workspace_pool); if (ret) @@ -354,6 +351,10 @@ zlib_err: 
} BUG_ON(!*dst_len); + BUG_ON(*dst_len > dst->bi_iter.bi_size); + + BUG_ON(*src_len & (block_bytes(c) - 1)); + BUG_ON(*src_len > src->bi_iter.bi_size); /* Didn't get smaller: */ if (round_up(*dst_len, block_bytes(c)) >= *src_len) { @@ -382,9 +383,9 @@ void bch_bio_compress(struct cache_set *c, unsigned orig_dst = dst->bi_iter.bi_size; unsigned orig_src = src->bi_iter.bi_size; - /* Don't consume more than BCH_COMPRESSED_EXTENT_MAX from @src: */ + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = - min(src->bi_iter.bi_size, BCH_COMPRESSED_EXTENT_MAX << 9); + min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9); /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = @@ -405,6 +406,30 @@ out: src->bi_iter.bi_size = orig_src; } +/* doesn't write superblock: */ +int bch_check_set_has_compressed_data(struct cache_set *c, + unsigned compression_type) +{ + switch (compression_type) { + case BCH_COMPRESSION_NONE: + return 0; + case BCH_COMPRESSION_LZ4: + if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) + return 0; + + bch_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4); + break; + case BCH_COMPRESSION_GZIP: + if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; + + bch_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP); + break; + } + + return bch_compress_init(c); +} + void bch_compress_free(struct cache_set *c) { vfree(c->zlib_workspace); @@ -420,39 +445,56 @@ void bch_compress_free(struct cache_set *c) int bch_compress_init(struct cache_set *c) { + unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9); int ret, cpu; - c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker); - if (!c->bio_decompress_worker) - return -ENOMEM; + if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) && + !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) + return 0; - for_each_possible_cpu(cpu) { - struct bio_decompress_worker *d = - per_cpu_ptr(c->bio_decompress_worker, cpu); + if (!c->bio_decompress_worker) { + 
c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker); + if (!c->bio_decompress_worker) + return -ENOMEM; - d->c = c; - INIT_WORK(&d->work, bch_bio_decompress_work); - init_llist_head(&d->bio_list); + for_each_possible_cpu(cpu) { + struct bio_decompress_worker *d = + per_cpu_ptr(c->bio_decompress_worker, cpu); + + d->c = c; + INIT_WORK(&d->work, bch_bio_decompress_work); + init_llist_head(&d->bio_list); + } } - ret = mempool_init_page_pool(&c->compression_bounce[READ], 1, - get_order(BCH_COMPRESSED_EXTENT_MAX << 9)); - if (ret) - return ret; + if (!mempool_initialized(&c->compression_bounce[READ])) { + ret = mempool_init_page_pool(&c->compression_bounce[READ], + 1, order); + if (ret) + return ret; + } - ret = mempool_init_page_pool(&c->compression_bounce[WRITE], 1, - get_order(BCH_COMPRESSED_EXTENT_MAX << 9)); - if (ret) - return ret; + if (!mempool_initialized(&c->compression_bounce[WRITE])) { + ret = mempool_init_page_pool(&c->compression_bounce[WRITE], + 1, order); + if (ret) + return ret; + } - ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, 1, - LZ4_MEM_COMPRESS); - if (ret) - return ret; + if (!mempool_initialized(&c->lz4_workspace_pool) && + bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) { + ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, + 1, LZ4_MEM_COMPRESS); + if (ret) + return ret; + } - c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE); - if (!c->zlib_workspace) - return -ENOMEM; + if (!c->zlib_workspace && + bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) { + c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE); + if (!c->zlib_workspace) + return -ENOMEM; + } return 0; } diff --git a/libbcache/compress.h b/libbcache/compress.h index 02578ef7..485acd95 100644 --- a/libbcache/compress.h +++ b/libbcache/compress.h @@ -2,12 +2,13 @@ #define _BCACHE_COMPRESS_H int bch_bio_uncompress_inplace(struct cache_set *, struct bio *, - unsigned, struct bch_extent_crc64); + unsigned, struct bch_extent_crc128); int 
bch_bio_uncompress(struct cache_set *, struct bio *, struct bio *, - struct bvec_iter, struct bch_extent_crc64); + struct bvec_iter, struct bch_extent_crc128); void bch_bio_compress(struct cache_set *, struct bio *, size_t *, struct bio *, size_t *, unsigned *); +int bch_check_set_has_compressed_data(struct cache_set *, unsigned); void bch_compress_free(struct cache_set *); int bch_compress_init(struct cache_set *); diff --git a/libbcache/debug.c b/libbcache/debug.c index 39f5550e..d25c32ae 100644 --- a/libbcache/debug.c +++ b/libbcache/debug.c @@ -96,7 +96,7 @@ void __bch_btree_verify(struct cache_set *c, struct btree *b) if (inmemory->u64s != sorted->u64s || memcmp(inmemory->start, sorted->start, - (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { + vstruct_end(inmemory) - (void *) inmemory->start)) { unsigned offset = 0, sectors; struct bset *i; unsigned j; @@ -112,18 +112,14 @@ void __bch_btree_verify(struct cache_set *c, struct btree *b) while (offset < b->written) { if (!offset ) { i = &n_ondisk->keys; - sectors = __set_blocks(n_ondisk, - le16_to_cpu(n_ondisk->keys.u64s), - block_bytes(c)) << + sectors = vstruct_blocks(n_ondisk, c->block_bits) << c->block_bits; } else { struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); i = &bne->keys; - sectors = __set_blocks(bne, - le16_to_cpu(bne->keys.u64s), - block_bytes(c)) << + sectors = vstruct_blocks(bne, c->block_bits) << c->block_bits; } @@ -427,7 +423,7 @@ void bch_debug_init_cache_set(struct cache_set *c) if (IS_ERR_OR_NULL(bch_debug)) return; - snprintf(name, sizeof(name), "%pU", c->disk_sb.user_uuid.b); + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); c->debug = debugfs_create_dir(name, bch_debug); if (IS_ERR_OR_NULL(c->debug)) return; diff --git a/libbcache/dirent.c b/libbcache/dirent.c index d97c3b22..ebf0f101 100644 --- a/libbcache/dirent.c +++ b/libbcache/dirent.c @@ -23,34 +23,13 @@ unsigned bch_dirent_name_bytes(struct bkey_s_c_dirent d) static u64 
bch_dirent_hash(const struct bch_hash_info *info, const struct qstr *name) { - switch (info->type) { - case BCH_STR_HASH_SHA1: { - SHASH_DESC_ON_STACK(desc, bch_sha1); - u8 digest[SHA1_DIGEST_SIZE]; - u64 ret; - desc->tfm = bch_sha1; - desc->flags = 0; - crypto_shash_init(desc); - - crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed)); - - crypto_shash_update(desc, (void *) name->name, name->len); - crypto_shash_final(desc, digest); - memcpy(&ret, &digest, sizeof(ret)); - return max_t(u64, ret >> 1, 2); - } - default: { - struct bch_str_hash_ctx ctx; - - bch_str_hash_init(&ctx, info->type); - bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed)); + struct bch_str_hash_ctx ctx; - bch_str_hash_update(&ctx, info->type, name->name, name->len); + bch_str_hash_init(&ctx, info); + bch_str_hash_update(&ctx, info, name->name, name->len); - /* [0,2) reserved for dots */ - return max_t(u64, bch_str_hash_end(&ctx, info->type), 2); - } - } + /* [0,2) reserved for dots */ + return max_t(u64, bch_str_hash_end(&ctx, info), 2); } static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) diff --git a/libbcache/extents.c b/libbcache/extents.c index c026d591..4b8a2665 100644 --- a/libbcache/extents.c +++ b/libbcache/extents.c @@ -9,19 +9,19 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "checksum.h" #include "debug.h" #include "dirent.h" #include "error.h" #include "extents.h" #include "inode.h" #include "journal.h" -#include "super.h" +#include "super-io.h" #include "writeback.h" #include "xattr.h" #include <trace/events/bcache.h> -static bool __bch_extent_normalize(struct cache_set *, struct bkey_s, bool); static enum merge_result bch_extent_merge(struct cache_set *, struct btree *, struct bkey_i *, struct bkey_i *); @@ -120,21 +120,38 @@ bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev) return NULL; } -unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent e, - const struct 
bch_extent_ptr *start) +unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e) { const struct bch_extent_ptr *ptr; unsigned nr_ptrs = 0; - extent_for_each_ptr_from(e, ptr, start) + extent_for_each_ptr(e, ptr) nr_ptrs++; return nr_ptrs; } -unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e) +unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c k) { - return bch_extent_nr_ptrs_from(e, &e.v->start->ptr); + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + unsigned nr_ptrs = 0; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) + nr_ptrs += !ptr->cached; + break; + + case BCH_RESERVATION: + nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas; + break; + } + + return nr_ptrs; } /* returns true if equal */ @@ -177,16 +194,19 @@ void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc * * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then * use crc_live here (that we verified was correct earlier) + * + * note: doesn't work with encryption */ void bch_extent_narrow_crcs(struct bkey_s_extent e) { union bch_extent_crc *crc; bool have_wide = false, have_narrow = false; - u64 csum = 0; + struct bch_csum csum = { 0 }; unsigned csum_type = 0; extent_for_each_crc(e, crc) { - if (crc_compression_type(crc)) + if (crc_compression_type(crc) || + bch_csum_type_is_encryption(crc_csum_type(crc))) continue; if (crc_uncompressed_size(e.k, crc) != e.k->size) { @@ -210,26 +230,38 @@ void bch_extent_narrow_crcs(struct bkey_s_extent e) case BCH_EXTENT_CRC_NONE: BUG(); case BCH_EXTENT_CRC32: - if (bch_crc_size[csum_type] > sizeof(crc->crc32.csum)) + if (bch_crc_bytes[csum_type] > 4) continue; bch_extent_crc_narrow_pointers(e, crc); - crc->crc32.compressed_size = e.k->size; - crc->crc32.uncompressed_size = e.k->size; + crc->crc32._compressed_size = e.k->size - 1; + crc->crc32._uncompressed_size = e.k->size - 1; crc->crc32.offset = 0; crc->crc32.csum_type = 
csum_type; - crc->crc32.csum = csum; + crc->crc32.csum = csum.lo; break; case BCH_EXTENT_CRC64: - if (bch_crc_size[csum_type] > sizeof(crc->crc64.csum)) + if (bch_crc_bytes[csum_type] > 10) continue; bch_extent_crc_narrow_pointers(e, crc); - crc->crc64.compressed_size = e.k->size; - crc->crc64.uncompressed_size = e.k->size; + crc->crc64._compressed_size = e.k->size - 1; + crc->crc64._uncompressed_size = e.k->size - 1; crc->crc64.offset = 0; crc->crc64.csum_type = csum_type; - crc->crc64.csum = csum; + crc->crc64.csum_lo = csum.lo; + crc->crc64.csum_hi = csum.hi; + break; + case BCH_EXTENT_CRC128: + if (bch_crc_bytes[csum_type] > 16) + continue; + + bch_extent_crc_narrow_pointers(e, crc); + crc->crc128._compressed_size = e.k->size - 1; + crc->crc128._uncompressed_size = e.k->size - 1; + crc->crc128.offset = 0; + crc->crc128.csum_type = csum_type; + crc->crc128.csum = csum; break; } } @@ -300,13 +332,8 @@ static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e) struct bch_extent_ptr *ptr = &e.v->start->ptr; bool dropped = false; - /* - * We don't want to change which pointers are considered cached/dirty, - * so don't remove pointers that are considered dirty: - */ rcu_read_lock(); - while ((ptr = extent_ptr_next(e, ptr)) && - !bch_extent_ptr_is_dirty(c, e.c, ptr)) + while ((ptr = extent_ptr_next(e, ptr))) if (should_drop_ptr(c, e.c, ptr)) { __bch_extent_drop_ptr(e, ptr); dropped = true; @@ -321,16 +348,43 @@ static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e) static bool bch_ptr_normalize(struct cache_set *c, struct btree *bk, struct bkey_s k) { - return __bch_extent_normalize(c, k, false); + return bch_extent_normalize(c, k); } static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) { - u64 *d = (u64 *) bkeyp_val(f, k); - unsigned i; + switch (k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + union bch_extent_entry *entry; + u64 *d = (u64 *) bkeyp_val(f, k); + unsigned i; - for (i = 0; i 
< bkeyp_val_u64s(f, k); i++) - d[i] = swab64(d[i]); + for (i = 0; i < bkeyp_val_u64s(f, k); i++) + d[i] = swab64(d[i]); + + for (entry = (union bch_extent_entry *) d; + entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k)); + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = swab64(entry->crc128.csum.hi); + entry->crc128.csum.lo = swab64(entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_ptr: + break; + } + } + break; + } + } } static const char *extent_ptr_invalid(struct bkey_s_c_extent e, @@ -341,7 +395,7 @@ static const char *extent_ptr_invalid(struct bkey_s_c_extent e, const struct bch_extent_ptr *ptr2; const struct cache_member_cpu *m = mi->m + ptr->dev; - if (ptr->dev > mi->nr_in_set || !m->valid) + if (ptr->dev >= mi->nr_devices || !m->valid) return "pointer to invalid device"; extent_for_each_ptr(e, ptr2) @@ -380,7 +434,9 @@ static size_t extent_print_ptrs(struct cache_set *c, char *buf, switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: crc = entry_to_crc(entry); + p("crc: c_size %u size %u offset %u csum %u compress %u", crc_compressed_size(e.k, crc), crc_uncompressed_size(e.k, crc), @@ -388,7 +444,8 @@ static size_t extent_print_ptrs(struct cache_set *c, char *buf, crc_compression_type(crc)); break; case BCH_EXTENT_ENTRY_ptr: - ptr = &entry->ptr; + ptr = entry_to_ptr(entry); + p("ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr) @@ -621,6 +678,10 @@ static bool __bch_cut_front(struct bpos where, struct bkey_s k) if (prev_crc != crc) crc->crc64.offset += e.k->size - len; break; + case 
BCH_EXTENT_CRC128: + if (prev_crc != crc) + crc->crc128.offset += e.k->size - len; + break; } prev_crc = crc; } @@ -948,7 +1009,7 @@ static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r) BUG_ON(!l.k->size || !r.k->size); if (l.k->type != r.k->type || - l.k->version != r.k->version) + bversion_cmp(l.k->version, r.k->version)) return false; switch (l.k->type) { @@ -985,7 +1046,7 @@ static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r) extent_for_each_ptr(le, lp) { const union bch_extent_entry *entry = - bkey_idx(re.v, (u64 *) lp - le.v->_data); + vstruct_idx(re.v, (u64 *) lp - le.v->_data); if (!extent_entry_is_ptr(entry)) return false; @@ -1142,7 +1203,7 @@ static void extent_insert_committed(struct extent_insert_state *s) if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && bkey_cmp(s->committed, insert->k.p) && - bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) { + bkey_extent_is_compressed(bkey_i_to_s_c(insert))) { /* XXX: possibly need to increase our reservation? */ bch_cut_subtract_back(s, s->committed, bkey_i_to_s(&split.k)); @@ -1178,12 +1239,19 @@ __extent_insert_advance_pos(struct extent_insert_state *s, { struct extent_insert_hook *hook = s->trans->hook; enum extent_insert_hook_ret ret; - +#if 0 + /* + * Currently disabled for encryption - broken with fcollapse. 
Will have + * to reenable when versions are exposed for send/receive - versions + * will have to be monotonic then: + */ if (k.k && k.k->size && - s->insert->k->k.version && - k.k->version > s->insert->k->k.version) + !bversion_zero(s->insert->k->k.version) && + bversion_cmp(k.k->version, s->insert->k->k.version) > 0) { ret = BTREE_HOOK_NO_INSERT; - else if (hook) + } else +#endif + if (hook) ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); else ret = BTREE_HOOK_DO_INSERT; @@ -1257,7 +1325,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s, unsigned sectors; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bkey_extent_is_compressed(c, k))) { + (sectors = bkey_extent_is_compressed(k))) { int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; if (s->trans->flags & BTREE_INSERT_NOFAIL) @@ -1680,6 +1748,7 @@ static const char *bch_extent_invalid(const struct cache_set *c, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; const union bch_extent_crc *crc; + const struct bch_extent_ptr *ptr; struct cache_member_rcu *mi = cache_member_info_get(c); unsigned size_ondisk = e.k->size; const char *reason; @@ -1689,9 +1758,7 @@ static const char *bch_extent_invalid(const struct cache_set *c, if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) goto invalid; - switch (extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_crc32: - case BCH_EXTENT_ENTRY_crc64: + if (extent_entry_is_crc(entry)) { crc = entry_to_crc(entry); reason = "checksum offset + key size > uncompressed size"; @@ -1702,19 +1769,19 @@ static const char *bch_extent_invalid(const struct cache_set *c, size_ondisk = crc_compressed_size(e.k, crc); reason = "invalid checksum type"; - if (crc_csum_type(crc) >= BCH_CSUM_NR) + if (!bch_checksum_type_valid(c, crc_csum_type(crc))) goto invalid; reason = "invalid compression type"; if (crc_compression_type(crc) >= BCH_COMPRESSION_NR) goto invalid; - break; - case BCH_EXTENT_ENTRY_ptr: + } else { + 
ptr = entry_to_ptr(entry); + reason = extent_ptr_invalid(e, mi, &entry->ptr, size_ondisk); if (reason) goto invalid; - break; } } @@ -1725,8 +1792,17 @@ invalid: return reason; } - case BCH_RESERVATION: + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) + return "incorrect value size"; + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) + return "invalid nr_replicas"; + return NULL; + } default: return "invalid value type"; @@ -1743,7 +1819,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, unsigned seq, stale; char buf[160]; bool bad; - unsigned ptrs_per_tier[CACHE_TIERS]; + unsigned ptrs_per_tier[BCH_TIER_MAX]; unsigned tier, replicas = 0; /* @@ -1760,11 +1836,9 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) { - bool dirty = bch_extent_ptr_is_dirty(c, e, ptr); - replicas++; - if (ptr->dev >= mi->nr_in_set) + if (ptr->dev >= mi->nr_devices) goto bad_device; /* @@ -1796,7 +1870,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, stale = ptr_stale(ca, ptr); - cache_set_bug_on(stale && dirty, c, + cache_set_bug_on(stale && !ptr->cached, c, "stale dirty pointer"); cache_set_bug_on(stale > 96, c, @@ -1809,9 +1883,9 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b, bad = (mark.is_metadata || (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && !mark.owned_by_allocator && - !(dirty - ? mark.dirty_sectors - : mark.cached_sectors))); + !(ptr->cached + ? 
mark.cached_sectors + : mark.dirty_sectors))); } while (read_seqcount_retry(&c->gc_pos_lock, seq)); if (bad) @@ -1869,6 +1943,7 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b, case BCH_EXTENT: case BCH_EXTENT_CACHED: bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k)); + break; case BCH_RESERVATION: break; default: @@ -1896,69 +1971,77 @@ static void bch_extent_to_text(struct cache_set *c, char *buf, static unsigned PTR_TIER(struct cache_member_rcu *mi, const struct bch_extent_ptr *ptr) { - return ptr->dev < mi->nr_in_set + return ptr->dev < mi->nr_devices ? mi->m[ptr->dev].tier : UINT_MAX; } -void bch_extent_entry_append(struct bkey_i_extent *e, - union bch_extent_entry *entry) -{ - BUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > - BKEY_EXTENT_VAL_U64s_MAX); - - memcpy_u64s(extent_entry_last(extent_i_to_s(e)), - entry, - extent_entry_u64s(entry)); - e->k.u64s += extent_entry_u64s(entry); -} - -const unsigned bch_crc_size[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64] = 8, -}; - static void bch_extent_crc_init(union bch_extent_crc *crc, unsigned compressed_size, unsigned uncompressed_size, unsigned compression_type, - u64 csum, unsigned csum_type) + unsigned nonce, + struct bch_csum csum, unsigned csum_type) { - if (bch_crc_size[csum_type] <= 4 && - uncompressed_size <= CRC32_EXTENT_SIZE_MAX) { + if (bch_crc_bytes[csum_type] <= 4 && + uncompressed_size <= CRC32_SIZE_MAX && + nonce <= CRC32_NONCE_MAX) { crc->crc32 = (struct bch_extent_crc32) { .type = 1 << BCH_EXTENT_ENTRY_crc32, - .compressed_size = compressed_size, - .uncompressed_size = uncompressed_size, + ._compressed_size = compressed_size - 1, + ._uncompressed_size = uncompressed_size - 1, .offset = 0, .compression_type = compression_type, .csum_type = csum_type, - .csum = csum, + .csum = *((__le32 *) &csum.lo), }; - } else { - BUG_ON(uncompressed_size > CRC64_EXTENT_SIZE_MAX); + return; + } + if (bch_crc_bytes[csum_type] <= 10 && + 
uncompressed_size <= CRC64_SIZE_MAX && + nonce <= CRC64_NONCE_MAX) { crc->crc64 = (struct bch_extent_crc64) { .type = 1 << BCH_EXTENT_ENTRY_crc64, - .compressed_size = compressed_size, - .uncompressed_size = uncompressed_size, + ._compressed_size = compressed_size - 1, + ._uncompressed_size = uncompressed_size - 1, + .offset = 0, + .nonce = nonce, + .compression_type = compression_type, + .csum_type = csum_type, + .csum_lo = csum.lo, + .csum_hi = *((__le16 *) &csum.hi), + }; + return; + } + + if (bch_crc_bytes[csum_type] <= 16 && + uncompressed_size <= CRC128_SIZE_MAX && + nonce <= CRC128_NONCE_MAX) { + crc->crc128 = (struct bch_extent_crc128) { + .type = 1 << BCH_EXTENT_ENTRY_crc128, + ._compressed_size = compressed_size - 1, + ._uncompressed_size = uncompressed_size - 1, .offset = 0, + .nonce = nonce, .compression_type = compression_type, .csum_type = csum_type, .csum = csum, }; + return; } + + BUG(); } void bch_extent_crc_append(struct bkey_i_extent *e, unsigned compressed_size, unsigned uncompressed_size, unsigned compression_type, - u64 csum, unsigned csum_type) + unsigned nonce, + struct bch_csum csum, unsigned csum_type) { union bch_extent_crc *crc; - union bch_extent_crc new; BUG_ON(compressed_size > uncompressed_size); BUG_ON(uncompressed_size != e->k.size); @@ -1971,123 +2054,26 @@ void bch_extent_crc_append(struct bkey_i_extent *e, extent_for_each_crc(extent_i_to_s(e), crc) ; - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - if (!csum_type && !compression_type) - return; - break; - case BCH_EXTENT_CRC32: - case BCH_EXTENT_CRC64: - if (crc_compressed_size(&e->k, crc) == compressed_size && - crc_uncompressed_size(&e->k, crc) == uncompressed_size && - crc_offset(crc) == 0 && - crc_compression_type(crc) == compression_type && - crc_csum_type(crc) == csum_type && - crc_csum(crc) == csum) - return; - break; - } + if (!crc && !csum_type && !compression_type) + return; + + if (crc && + crc_compressed_size(&e->k, crc) == compressed_size && + 
crc_uncompressed_size(&e->k, crc) == uncompressed_size && + crc_offset(crc) == 0 && + crc_nonce(crc) == nonce && + crc_csum_type(crc) == csum_type && + crc_compression_type(crc) == compression_type && + crc_csum(crc).lo == csum.lo && + crc_csum(crc).hi == csum.hi) + return; - bch_extent_crc_init(&new, + bch_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), compressed_size, uncompressed_size, compression_type, - csum, csum_type); - bch_extent_entry_append(e, to_entry(&new)); -} - -static void __extent_sort_ptrs(struct cache_member_rcu *mi, - struct bkey_s_extent src) -{ - struct bch_extent_ptr *src_ptr, *dst_ptr; - union bch_extent_crc *src_crc, *dst_crc; - union bch_extent_crc _src; - BKEY_PADDED(k) tmp; - struct bkey_s_extent dst; - size_t u64s, crc_u64s; - u64 *p; - - /* - * Insertion sort: - * - * Note: this sort needs to be stable, because pointer order determines - * pointer dirtyness. - */ - - tmp.k.k = *src.k; - dst = bkey_i_to_s_extent(&tmp.k); - set_bkey_val_u64s(dst.k, 0); - - extent_for_each_ptr_crc(src, src_ptr, src_crc) { - extent_for_each_ptr_crc(dst, dst_ptr, dst_crc) - if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr)) - goto found; - - dst_ptr = &extent_entry_last(dst)->ptr; - dst_crc = NULL; -found: - /* found insert position: */ - - /* - * we're making sure everything has a crc at this point, if - * dst_ptr points to a pointer it better have a crc: - */ - BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc); - BUG_ON(dst_crc && - (extent_entry_next(to_entry(dst_crc)) != - to_entry(dst_ptr))); - - if (!src_crc) { - bch_extent_crc_init(&_src, src.k->size, - src.k->size, 0, 0, 0); - src_crc = &_src; - } - - p = dst_ptr != &extent_entry_last(dst)->ptr - ? 
(void *) dst_crc - : (void *) dst_ptr; - - crc_u64s = extent_entry_u64s(to_entry(src_crc)); - u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64); - - memmove_u64s_up(p + u64s, p, - (u64 *) extent_entry_last(dst) - (u64 *) p); - set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s); - - memcpy_u64s(p, src_crc, crc_u64s); - memcpy_u64s(p + crc_u64s, src_ptr, - sizeof(*src_ptr) / sizeof(u64)); - } - - /* Sort done - now drop redundant crc entries: */ - bch_extent_drop_redundant_crcs(dst); - - memcpy_u64s(src.v, dst.v, bkey_val_u64s(dst.k)); - set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k)); -} - -static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e) -{ - struct cache_member_rcu *mi; - struct bch_extent_ptr *ptr, *prev = NULL; - union bch_extent_crc *crc; - - /* - * First check if any pointers are out of order before doing the actual - * sort: - */ - mi = cache_member_info_get(c); - - extent_for_each_ptr_crc(e, ptr, crc) { - if (prev && - PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) { - __extent_sort_ptrs(mi, e); - break; - } - prev = ptr; - } - - cache_member_info_put(); + nonce, csum, csum_type); + __extent_entry_push(e); } /* @@ -2098,8 +2084,7 @@ static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e) * For existing keys, only called when btree nodes are being rewritten, not when * they're merely being compacted/resorted in memory. 
*/ -static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k, - bool sort) +bool bch_extent_normalize(struct cache_set *c, struct bkey_s k) { struct bkey_s_extent e; @@ -2112,7 +2097,7 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k, return true; case KEY_TYPE_DISCARD: - return !k.k->version; + return bversion_zero(k.k->version); case BCH_EXTENT: case BCH_EXTENT_CACHED: @@ -2120,13 +2105,10 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k, bch_extent_drop_stale(c, e); - if (sort) - extent_sort_ptrs(c, e); - if (!bkey_val_u64s(e.k)) { if (bkey_extent_is_cached(e.k)) { k.k->type = KEY_TYPE_DISCARD; - if (!k.k->version) + if (bversion_zero(k.k->version)) return true; } else { k.k->type = KEY_TYPE_ERROR; @@ -2141,9 +2123,40 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k, } } -bool bch_extent_normalize(struct cache_set *c, struct bkey_s k) +void bch_extent_mark_replicas_cached(struct cache_set *c, + struct bkey_s_extent e, + unsigned nr_cached) { - return __bch_extent_normalize(c, k, true); + struct bch_extent_ptr *ptr; + struct cache_member_rcu *mi; + bool have_higher_tier; + unsigned tier = 0; + + if (!nr_cached) + return; + + mi = cache_member_info_get(c); + + do { + have_higher_tier = false; + + extent_for_each_ptr(e, ptr) { + if (!ptr->cached && + PTR_TIER(mi, ptr) == tier) { + ptr->cached = true; + nr_cached--; + if (!nr_cached) + goto out; + } + + if (PTR_TIER(mi, ptr) > tier) + have_higher_tier = true; + } + + tier++; + } while (have_higher_tier); +out: + cache_member_info_put(); } /* @@ -2183,7 +2196,7 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k, extent_for_each_online_device_crc(c, e, crc, ptr, ca) if (!ptr_stale(ca, ptr)) { *ret = (struct extent_pick_ptr) { - .crc = crc_to_64(e.k, crc), + .crc = crc_to_128(e.k, crc), .ptr = *ptr, .ca = ca, }; @@ -2227,7 +2240,7 @@ static enum merge_result bch_extent_merge(struct cache_set *c, if 
(l->k.u64s != r->k.u64s || l->k.type != r->k.type || - l->k.version != r->k.version || + bversion_cmp(l->k.version, r->k.version) || bkey_cmp(l->k.p, bkey_start_pos(&r->k))) return BCH_MERGE_NOMERGE; @@ -2235,7 +2248,6 @@ static enum merge_result bch_extent_merge(struct cache_set *c, case KEY_TYPE_DELETED: case KEY_TYPE_DISCARD: case KEY_TYPE_ERROR: - case BCH_RESERVATION: /* These types are mergeable, and no val to check */ break; @@ -2248,7 +2260,7 @@ static enum merge_result bch_extent_merge(struct cache_set *c, struct bch_extent_ptr *lp, *rp; struct cache_member_cpu *m; - en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data); + en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data); if ((extent_entry_type(en_l) != extent_entry_type(en_r)) || @@ -2276,6 +2288,15 @@ static enum merge_result bch_extent_merge(struct cache_set *c, } break; + case BCH_RESERVATION: { + struct bkey_i_reservation *li = bkey_i_to_reservation(l); + struct bkey_i_reservation *ri = bkey_i_to_reservation(r); + + if (li->v.generation != ri->v.generation || + li->v.nr_replicas != ri->v.nr_replicas) + return BCH_MERGE_NOMERGE; + break; + } default: return BCH_MERGE_NOMERGE; } diff --git a/libbcache/extents.h b/libbcache/extents.h index e1cb47ab..b0a05422 100644 --- a/libbcache/extents.h +++ b/libbcache/extents.h @@ -26,7 +26,7 @@ struct cache_set; struct journal_res; struct extent_pick_ptr { - struct bch_extent_crc64 crc; + struct bch_extent_crc128 crc; struct bch_extent_ptr ptr; struct cache *ca; }; @@ -53,10 +53,11 @@ bch_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); bool bch_extent_normalize(struct cache_set *, struct bkey_s); +void bch_extent_mark_replicas_cached(struct cache_set *, + struct bkey_s_extent, unsigned); -unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent, - const struct bch_extent_ptr *); unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent); +unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c); static inline bool bkey_extent_is_data(const struct bkey 
*k) { @@ -117,6 +118,8 @@ static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) return sizeof(struct bch_extent_crc32); case BCH_EXTENT_ENTRY_crc64: return sizeof(struct bch_extent_crc64); + case BCH_EXTENT_ENTRY_crc128: + return sizeof(struct bch_extent_crc128); case BCH_EXTENT_ENTRY_ptr: return sizeof(struct bch_extent_ptr); default: @@ -143,6 +146,7 @@ union bch_extent_crc { u8 type; struct bch_extent_crc32 crc32; struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; }; /* downcast, preserves const */ @@ -185,10 +189,11 @@ enum bch_extent_crc_type { BCH_EXTENT_CRC_NONE, BCH_EXTENT_CRC32, BCH_EXTENT_CRC64, + BCH_EXTENT_CRC128, }; static inline enum bch_extent_crc_type -extent_crc_type(const union bch_extent_crc *crc) +__extent_crc_type(const union bch_extent_crc *crc) { if (!crc) return BCH_EXTENT_CRC_NONE; @@ -198,16 +203,31 @@ extent_crc_type(const union bch_extent_crc *crc) return BCH_EXTENT_CRC32; case BCH_EXTENT_ENTRY_crc64: return BCH_EXTENT_CRC64; + case BCH_EXTENT_ENTRY_crc128: + return BCH_EXTENT_CRC128; default: BUG(); } } +#define extent_crc_type(_crc) \ +({ \ + BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \ + !type_is(_crc, struct bch_extent_crc64 *) && \ + !type_is(_crc, struct bch_extent_crc128 *) && \ + !type_is(_crc, union bch_extent_crc *)); \ + \ + type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \ + : type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \ + : type_is(_crc, struct bch_extent_crc128 *) ? 
BCH_EXTENT_CRC128 \ + : __extent_crc_type((union bch_extent_crc *) _crc); \ +}) + #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) #define extent_entry_last(_e) \ - bkey_idx((_e).v, bkey_val_u64s((_e).k)) + vstruct_idx((_e).v, bkey_val_u64s((_e).k)) /* Iterate over all entries: */ @@ -283,20 +303,16 @@ out: \ #define extent_ptr_next(_e, _ptr) \ extent_ptr_next_filter(_e, _ptr, true) -#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter) \ - for ((_ptr) = (_start); \ +#define extent_for_each_ptr_filter(_e, _ptr, _filter) \ + for ((_ptr) = &(_e).v->start->ptr; \ ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \ (_ptr)++) -#define extent_for_each_ptr_from(_e, _ptr, _start) \ - extent_for_each_ptr_from_filter(_e, _ptr, _start, true) - #define extent_for_each_ptr(_e, _ptr) \ - extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, true) + extent_for_each_ptr_filter(_e, _ptr, true) #define extent_for_each_online_device(_c, _e, _ptr, _ca) \ - extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, \ - ((_ca) = PTR_CACHE(_c, _ptr))) + extent_for_each_ptr_filter(_e, _ptr, ((_ca) = PTR_CACHE(_c, _ptr))) #define extent_ptr_prev(_e, _ptr) \ ({ \ @@ -321,67 +337,114 @@ out: \ (_ptr); \ (_ptr) = extent_ptr_prev(_e, _ptr)) -void bch_extent_entry_append(struct bkey_i_extent *, union bch_extent_entry *); void bch_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned, - unsigned, u64, unsigned); + unsigned, unsigned, struct bch_csum, unsigned); + +static inline void __extent_entry_push(struct bkey_i_extent *e) +{ + union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); + + EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) > + BKEY_EXTENT_VAL_U64s_MAX); + + e->k.u64s += extent_entry_u64s(entry); +} static inline void extent_ptr_append(struct bkey_i_extent *e, struct bch_extent_ptr ptr) { ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - bch_extent_entry_append(e, to_entry(&ptr)); 
+ extent_entry_last(extent_i_to_s(e))->ptr = ptr; + __extent_entry_push(e); } -/* XXX: inefficient */ -static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c, - struct bkey_s_c_extent e, - const struct bch_extent_ptr *ptr) +static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k, + const union bch_extent_crc *crc) { - if (bkey_extent_is_cached(e.k)) - return false; - - /* Dirty pointers come last */ - return bch_extent_nr_ptrs_from(e, ptr) <= c->opts.data_replicas; -} - -extern const unsigned bch_crc_size[]; + EBUG_ON(!k->size); -static inline struct bch_extent_crc64 crc_to_64(const struct bkey *k, - const union bch_extent_crc *crc) -{ switch (extent_crc_type(crc)) { case BCH_EXTENT_CRC_NONE: - return (struct bch_extent_crc64) { - .compressed_size = k->size, - .uncompressed_size = k->size, + return (struct bch_extent_crc128) { + ._compressed_size = k->size - 1, + ._uncompressed_size = k->size - 1, }; case BCH_EXTENT_CRC32: - return (struct bch_extent_crc64) { - .compressed_size = crc->crc32.compressed_size, - .uncompressed_size = crc->crc32.uncompressed_size, + return (struct bch_extent_crc128) { + .type = 1 << BCH_EXTENT_ENTRY_crc128, + ._compressed_size = crc->crc32._compressed_size, + ._uncompressed_size = crc->crc32._uncompressed_size, .offset = crc->crc32.offset, .csum_type = crc->crc32.csum_type, .compression_type = crc->crc32.compression_type, - .csum = crc->crc32.csum, + .csum.lo = crc->crc32.csum, }; case BCH_EXTENT_CRC64: - return crc->crc64; + return (struct bch_extent_crc128) { + .type = 1 << BCH_EXTENT_ENTRY_crc128, + ._compressed_size = crc->crc64._compressed_size, + ._uncompressed_size = crc->crc64._uncompressed_size, + .offset = crc->crc64.offset, + .nonce = crc->crc64.nonce, + .csum_type = crc->crc64.csum_type, + .compression_type = crc->crc64.compression_type, + .csum.lo = crc->crc64.csum_lo, + .csum.hi = crc->crc64.csum_hi, + }; + case BCH_EXTENT_CRC128: + return crc->crc128; default: BUG(); } } -static inline 
unsigned crc_compressed_size(const struct bkey *k, - const union bch_extent_crc *crc) -{ - return crc_to_64(k, crc).compressed_size; -} +#define crc_compressed_size(_k, _crc) \ +({ \ + unsigned _size = 0; \ + \ + switch (extent_crc_type(_crc)) { \ + case BCH_EXTENT_CRC_NONE: \ + _size = ((const struct bkey *) (_k))->size; \ + break; \ + case BCH_EXTENT_CRC32: \ + _size = ((struct bch_extent_crc32 *) _crc) \ + ->_compressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC64: \ + _size = ((struct bch_extent_crc64 *) _crc) \ + ->_compressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC128: \ + _size = ((struct bch_extent_crc128 *) _crc) \ + ->_compressed_size + 1; \ + break; \ + } \ + _size; \ +}) -static inline unsigned crc_uncompressed_size(const struct bkey *k, - const union bch_extent_crc *crc) -{ - return crc_to_64(k, crc).uncompressed_size; -} +#define crc_uncompressed_size(_k, _crc) \ +({ \ + unsigned _size = 0; \ + \ + switch (extent_crc_type(_crc)) { \ + case BCH_EXTENT_CRC_NONE: \ + _size = ((const struct bkey *) (_k))->size; \ + break; \ + case BCH_EXTENT_CRC32: \ + _size = ((struct bch_extent_crc32 *) _crc) \ + ->_uncompressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC64: \ + _size = ((struct bch_extent_crc64 *) _crc) \ + ->_uncompressed_size + 1; \ + break; \ + case BCH_EXTENT_CRC128: \ + _size = ((struct bch_extent_crc128 *) _crc) \ + ->_uncompressed_size + 1; \ + break; \ + } \ + _size; \ +}) static inline unsigned crc_offset(const union bch_extent_crc *crc) { @@ -392,6 +455,23 @@ static inline unsigned crc_offset(const union bch_extent_crc *crc) return crc->crc32.offset; case BCH_EXTENT_CRC64: return crc->crc64.offset; + case BCH_EXTENT_CRC128: + return crc->crc128.offset; + default: + BUG(); + } +} + +static inline unsigned crc_nonce(const union bch_extent_crc *crc) +{ + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + case BCH_EXTENT_CRC32: + return 0; + case BCH_EXTENT_CRC64: + return crc->crc64.nonce; + case BCH_EXTENT_CRC128: + return 
crc->crc128.nonce; default: BUG(); } @@ -406,6 +486,8 @@ static inline unsigned crc_csum_type(const union bch_extent_crc *crc) return crc->crc32.csum_type; case BCH_EXTENT_CRC64: return crc->crc64.csum_type; + case BCH_EXTENT_CRC128: + return crc->crc128.csum_type; default: BUG(); } @@ -420,27 +502,33 @@ static inline unsigned crc_compression_type(const union bch_extent_crc *crc) return crc->crc32.compression_type; case BCH_EXTENT_CRC64: return crc->crc64.compression_type; + case BCH_EXTENT_CRC128: + return crc->crc128.compression_type; default: BUG(); } } -static inline u64 crc_csum(const union bch_extent_crc *crc) +static inline struct bch_csum crc_csum(const union bch_extent_crc *crc) { switch (extent_crc_type(crc)) { case BCH_EXTENT_CRC_NONE: - return 0; + return (struct bch_csum) { 0 }; case BCH_EXTENT_CRC32: - return crc->crc32.csum; + return (struct bch_csum) { .lo = crc->crc32.csum }; case BCH_EXTENT_CRC64: - return crc->crc64.csum; + return (struct bch_csum) { + .lo = crc->crc64.csum_lo, + .hi = crc->crc64.csum_hi, + }; + case BCH_EXTENT_CRC128: + return crc->crc128.csum; default: BUG(); } } -static inline unsigned bkey_extent_is_compressed(struct cache_set *c, - struct bkey_s_c k) +static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k) { struct bkey_s_c_extent e; const struct bch_extent_ptr *ptr; @@ -453,7 +541,7 @@ static inline unsigned bkey_extent_is_compressed(struct cache_set *c, e = bkey_s_c_to_extent(k); extent_for_each_ptr_crc(e, ptr, crc) - if (bch_extent_ptr_is_dirty(c, e, ptr) && + if (!ptr->cached && crc_compression_type(crc) != BCH_COMPRESSION_NONE && crc_compressed_size(e.k, crc) < k.k->size) ret = max_t(unsigned, ret, @@ -463,6 +551,17 @@ static inline unsigned bkey_extent_is_compressed(struct cache_set *c, return ret; } +static inline unsigned extent_current_nonce(struct bkey_s_c_extent e) +{ + const union bch_extent_crc *crc; + + extent_for_each_crc(e, crc) + if (bch_csum_type_is_encryption(crc_csum_type(crc))) + return 
crc_offset(crc) + crc_nonce(crc); + + return 0; +} + void bch_extent_narrow_crcs(struct bkey_s_extent); void bch_extent_drop_redundant_crcs(struct bkey_s_extent); diff --git a/libbcache/fs-gc.c b/libbcache/fs-gc.c index 1dec230f..a758e895 100644 --- a/libbcache/fs-gc.c +++ b/libbcache/fs-gc.c @@ -17,7 +17,7 @@ static int remove_dirent(struct cache_set *c, struct btree_iter *iter, struct bkey_s_c_dirent dirent) { struct qstr name; - struct bkey_i_inode dir_inode; + struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; u64 dir_inum = dirent.k->p.inode; int ret; @@ -39,7 +39,7 @@ static int remove_dirent(struct cache_set *c, struct btree_iter *iter, if (ret) goto err; - dir_hash_info = bch_hash_info_init(&dir_inode.v); + dir_hash_info = bch_hash_info_init(&dir_inode); ret = bch_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL); err: @@ -48,11 +48,12 @@ err: } static int reattach_inode(struct cache_set *c, - struct bkey_i_inode *lostfound_inode, + struct bch_inode_unpacked *lostfound_inode, u64 inum) { struct bch_hash_info lostfound_hash_info = - bch_hash_info_init(&lostfound_inode->v); + bch_hash_info_init(lostfound_inode); + struct bkey_inode_buf packed; char name_buf[20]; struct qstr name; int ret; @@ -60,14 +61,16 @@ static int reattach_inode(struct cache_set *c, snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - le32_add_cpu(&lostfound_inode->v.i_nlink, 1); + lostfound_inode->i_nlink++; - ret = bch_btree_insert(c, BTREE_ID_INODES, &lostfound_inode->k_i, + bch_inode_pack(&packed, lostfound_inode); + + ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, NULL, NULL, NULL, 0); if (ret) return ret; - return bch_dirent_create(c, lostfound_inode->k.p.inode, + return bch_dirent_create(c, lostfound_inode->inum, &lostfound_hash_info, DT_DIR, &name, inum, NULL, 0); } @@ -75,10 +78,8 @@ static int reattach_inode(struct cache_set *c, struct inode_walker { bool first_this_inode; bool have_inode; - u16 
i_mode; - u64 i_size; u64 cur_inum; - struct bkey_i_inode inode; + struct bch_inode_unpacked inode; }; static struct inode_walker inode_walker_init(void) @@ -101,11 +102,6 @@ static int walk_inode(struct cache_set *c, struct inode_walker *w, u64 inum) return ret; w->have_inode = !ret; - - if (w->have_inode) { - w->i_mode = le16_to_cpu(w->inode.v.i_mode); - w->i_size = le64_to_cpu(w->inode.v.i_size); - } } return 0; @@ -138,20 +134,20 @@ static int check_extents(struct cache_set *c) k.k->type, k.k->p.inode); unfixable_fsck_err_on(w.first_this_inode && w.have_inode && - le64_to_cpu(w.inode.v.i_sectors) != + w.inode.i_sectors != (i_sectors = bch_count_inode_sectors(c, w.cur_inum)), c, "i_sectors wrong: got %llu, should be %llu", - le64_to_cpu(w.inode.v.i_sectors), i_sectors); + w.inode.i_sectors, i_sectors); unfixable_fsck_err_on(w.have_inode && - !S_ISREG(w.i_mode) && !S_ISLNK(w.i_mode), c, + !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c, "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.i_mode); + k.k->type, k.k->p.inode, w.inode.i_mode); unfixable_fsck_err_on(k.k->type != BCH_RESERVATION && - k.k->p.offset > round_up(w.i_size, PAGE_SIZE) >> 9, c, + k.k->p.offset > round_up(w.inode.i_size, PAGE_SIZE) >> 9, c, "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.i_size); + k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size); } fsck_err: return bch_btree_iter_unlock(&iter) ?: ret; @@ -172,7 +168,7 @@ static int check_dirents(struct cache_set *c) for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(BCACHE_ROOT_INO, 0), k) { struct bkey_s_c_dirent d; - struct bkey_i_inode target; + struct bch_inode_unpacked target; bool have_target; u64 d_inum; @@ -184,9 +180,9 @@ static int check_dirents(struct cache_set *c) "dirent in nonexisting directory %llu", k.k->p.inode); - unfixable_fsck_err_on(!S_ISDIR(w.i_mode), c, + unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c, 
"dirent in non directory inode %llu, type %u", - k.k->p.inode, mode_to_type(w.i_mode)); + k.k->p.inode, mode_to_type(w.inode.i_mode)); if (k.k->type != BCH_DIRENT) continue; @@ -220,10 +216,10 @@ static int check_dirents(struct cache_set *c) if (fsck_err_on(have_target && d.v->d_type != - mode_to_type(le16_to_cpu(target.v.i_mode)), c, + mode_to_type(le16_to_cpu(target.i_mode)), c, "incorrect d_type: got %u should be %u, filename %s", d.v->d_type, - mode_to_type(le16_to_cpu(target.v.i_mode)), + mode_to_type(le16_to_cpu(target.i_mode)), d.v->d_name)) { struct bkey_i_dirent *n; @@ -234,7 +230,7 @@ static int check_dirents(struct cache_set *c) } bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(le16_to_cpu(target.v.i_mode)); + n->v.d_type = mode_to_type(le16_to_cpu(target.i_mode)); ret = bch_btree_insert_at(c, NULL, NULL, NULL, BTREE_INSERT_NOFAIL, @@ -276,8 +272,9 @@ fsck_err: } /* Get root directory, create if it doesn't exist: */ -static int check_root(struct cache_set *c, struct bkey_i_inode *root_inode) +static int check_root(struct cache_set *c, struct bch_inode_unpacked *root_inode) { + struct bkey_inode_buf packed; int ret; ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode); @@ -287,7 +284,7 @@ static int check_root(struct cache_set *c, struct bkey_i_inode *root_inode) if (fsck_err_on(ret, c, "root directory missing")) goto create_root; - if (fsck_err_on(!S_ISDIR(le16_to_cpu(root_inode->v.i_mode)), c, + if (fsck_err_on(!S_ISDIR(root_inode->i_mode), c, "root inode not a directory")) goto create_root; @@ -296,19 +293,23 @@ fsck_err: return ret; create_root: bch_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - root_inode->k.p.inode = BCACHE_ROOT_INO; + root_inode->inum = BCACHE_ROOT_INO; + + bch_inode_pack(&packed, root_inode); - return bch_btree_insert(c, BTREE_ID_INODES, &root_inode->k_i, + return bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, NULL, NULL, NULL, 0); } /* Get lost+found, create if it doesn't 
exist: */ static int check_lostfound(struct cache_set *c, - struct bkey_i_inode *root_inode, - struct bkey_i_inode *lostfound_inode) + struct bch_inode_unpacked *root_inode, + struct bch_inode_unpacked *lostfound_inode) { struct qstr lostfound = QSTR("lost+found"); - struct bch_hash_info root_hash_info = bch_hash_info_init(&root_inode->v); + struct bch_hash_info root_hash_info = + bch_hash_info_init(root_inode); + struct bkey_inode_buf packed; u64 inum; int ret; @@ -326,7 +327,7 @@ static int check_lostfound(struct cache_set *c, if (fsck_err_on(ret, c, "lost+found missing")) goto create_lostfound; - if (fsck_err_on(!S_ISDIR(le16_to_cpu(lostfound_inode->v.i_mode)), c, + if (fsck_err_on(!S_ISDIR(lostfound_inode->i_mode), c, "lost+found inode not a directory")) goto create_lostfound; @@ -334,22 +335,27 @@ static int check_lostfound(struct cache_set *c, fsck_err: return ret; create_lostfound: - le32_add_cpu(&root_inode->v.i_nlink, 1); + root_inode->i_nlink++; - ret = bch_btree_insert(c, BTREE_ID_INODES, &root_inode->k_i, + bch_inode_pack(&packed, root_inode); + + ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, NULL, NULL, NULL, 0); if (ret) return ret; bch_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); + bch_inode_pack(&packed, lostfound_inode); - ret = bch_inode_create(c, &lostfound_inode->k_i, BLOCKDEV_INODE_MAX, 0, + ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (ret) return ret; + lostfound_inode->inum = packed.inode.k.p.inode; + ret = bch_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode->k.p.inode, NULL, 0); + &lostfound, lostfound_inode->inum, NULL, 0); if (ret) return ret; @@ -420,7 +426,7 @@ static int path_down(struct pathbuf *p, u64 inum) noinline_for_stack static int check_directory_structure(struct cache_set *c, - struct bkey_i_inode *lostfound_inode) + struct bch_inode_unpacked *lostfound_inode) { struct inode_bitmap dirs_done = 
{ NULL, 0 }; struct pathbuf path = { 0, 0, NULL }; @@ -618,25 +624,30 @@ s64 bch_count_inode_sectors(struct cache_set *c, u64 inum) } static int bch_gc_do_inode(struct cache_set *c, - struct bkey_i_inode *lostfound_inode, + struct bch_inode_unpacked *lostfound_inode, struct btree_iter *iter, struct bkey_s_c_inode inode, struct nlink link) { - u16 i_mode = le16_to_cpu(inode.v->i_mode); - u32 i_flags = le32_to_cpu(inode.v->i_flags); - u32 i_nlink = le32_to_cpu(inode.v->i_nlink); - u64 i_size = le64_to_cpu(inode.v->i_size); - s64 i_sectors = 0; + struct bch_inode_unpacked u; int ret = 0; - u32 real_i_nlink; + u32 i_nlink, real_i_nlink; + bool do_update = false; + + ret = bch_inode_unpack(inode, &u); + if (cache_set_inconsistent_on(ret, c, + "error unpacking inode %llu in fs-gc", + inode.k->p.inode)) + return ret; + + i_nlink = u.i_nlink + nlink_bias(u.i_mode); fsck_err_on(i_nlink < link.count, c, "inode %llu i_link too small (%u < %u, type %i)", inode.k->p.inode, i_nlink, - link.count, mode_to_type(i_mode)); + link.count, mode_to_type(u.i_mode)); /* These should have been caught/fixed by earlier passes: */ - if (S_ISDIR(i_mode)) { + if (S_ISDIR(u.i_mode)) { need_fsck_err_on(link.count > 1, c, "directory %llu with multiple hardlinks: %u", inode.k->p.inode, link.count); @@ -656,7 +667,7 @@ static int bch_gc_do_inode(struct cache_set *c, "but found orphaned inode %llu", inode.k->p.inode); - if (fsck_err_on(S_ISDIR(i_mode) && + if (fsck_err_on(S_ISDIR(u.i_mode) && bch_empty_dir(c, inode.k->p.inode), c, "non empty directory with link count 0, " "inode nlink %u, dir links found %u", @@ -676,7 +687,7 @@ static int bch_gc_do_inode(struct cache_set *c, return ret; } - if (i_flags & BCH_INODE_I_SIZE_DIRTY) { + if (u.i_flags & BCH_INODE_I_SIZE_DIRTY) { fsck_err_on(c->sb.clean, c, "filesystem marked clean, " "but inode %llu has i_size dirty", @@ -690,7 +701,7 @@ static int bch_gc_do_inode(struct cache_set *c, */ ret = bch_inode_truncate(c, inode.k->p.inode, - round_up(i_size, 
PAGE_SIZE) >> 9, + round_up(u.i_size, PAGE_SIZE) >> 9, NULL, NULL); if (ret) { bch_err(c, "error in fs gc: error %i " @@ -702,10 +713,15 @@ static int bch_gc_do_inode(struct cache_set *c, * We truncated without our normal sector accounting hook, just * make sure we recalculate it: */ - i_flags |= BCH_INODE_I_SECTORS_DIRTY; + u.i_flags |= BCH_INODE_I_SECTORS_DIRTY; + + u.i_flags &= ~BCH_INODE_I_SIZE_DIRTY; + do_update = true; } - if (i_flags & BCH_INODE_I_SECTORS_DIRTY) { + if (u.i_flags & BCH_INODE_I_SECTORS_DIRTY) { + s64 sectors; + fsck_err_on(c->sb.clean, c, "filesystem marked clean, " "but inode %llu has i_sectors dirty", @@ -714,13 +730,17 @@ static int bch_gc_do_inode(struct cache_set *c, bch_verbose(c, "recounting sectors for inode %llu", inode.k->p.inode); - i_sectors = bch_count_inode_sectors(c, inode.k->p.inode); - if (i_sectors < 0) { + sectors = bch_count_inode_sectors(c, inode.k->p.inode); + if (sectors < 0) { bch_err(c, "error in fs gc: error %i " "recounting inode sectors", - (int) i_sectors); - return i_sectors; + (int) sectors); + return sectors; } + + u.i_sectors = sectors; + u.i_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + do_update = true; } if (i_nlink != real_i_nlink) { @@ -728,30 +748,23 @@ static int bch_gc_do_inode(struct cache_set *c, "filesystem marked clean, " "but inode %llu has wrong i_nlink " "(type %u i_nlink %u, should be %u)", - inode.k->p.inode, mode_to_type(i_mode), + inode.k->p.inode, mode_to_type(u.i_mode), i_nlink, real_i_nlink); bch_verbose(c, "setting inode %llu nlinks from %u to %u", inode.k->p.inode, i_nlink, real_i_nlink); + u.i_nlink = real_i_nlink - nlink_bias(u.i_mode);; + do_update = true; } - if (i_nlink != real_i_nlink|| - i_flags & BCH_INODE_I_SECTORS_DIRTY || - i_flags & BCH_INODE_I_SIZE_DIRTY) { - struct bkey_i_inode update; - - bkey_reassemble(&update.k_i, inode.s_c); - update.v.i_nlink = cpu_to_le32(real_i_nlink); - update.v.i_flags = cpu_to_le32(i_flags & - ~(BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY)); + 
if (do_update) { + struct bkey_inode_buf p; - if (i_flags & BCH_INODE_I_SECTORS_DIRTY) - update.v.i_sectors = cpu_to_le64(i_sectors); + bch_inode_pack(&p, &u); ret = bch_btree_insert_at(c, NULL, NULL, NULL, BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(iter, &update.k_i)); + BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); if (ret && ret != -EINTR) bch_err(c, "error in fs gc: error %i " "updating inode", ret); @@ -762,7 +775,7 @@ fsck_err: noinline_for_stack static int bch_gc_walk_inodes(struct cache_set *c, - struct bkey_i_inode *lostfound_inode, + struct bch_inode_unpacked *lostfound_inode, struct nlinks *links, u64 range_start, u64 range_end) { @@ -835,7 +848,7 @@ fsck_err: noinline_for_stack static int check_inode_nlinks(struct cache_set *c, - struct bkey_i_inode *lostfound_inode) + struct bch_inode_unpacked *lostfound_inode) { struct nlinks links; u64 this_iter_range_start, next_iter_range_start = 0; @@ -873,7 +886,7 @@ static int check_inode_nlinks(struct cache_set *c, */ int bch_fsck(struct cache_set *c, bool full_fsck) { - struct bkey_i_inode root_inode, lostfound_inode; + struct bch_inode_unpacked root_inode, lostfound_inode; int ret; ret = check_root(c, &root_inode); diff --git a/libbcache/fs-io.c b/libbcache/fs-io.c index 942baeb1..ecf249c3 100644 --- a/libbcache/fs-io.c +++ b/libbcache/fs-io.c @@ -59,22 +59,20 @@ static int write_invalidate_inode_pages_range(struct address_space *mapping, /* i_size updates: */ -static int inode_set_size(struct bch_inode_info *ei, struct bch_inode *bi, +static int inode_set_size(struct bch_inode_info *ei, + struct bch_inode_unpacked *bi, void *p) { loff_t *new_i_size = p; - unsigned i_flags = le32_to_cpu(bi->i_flags); lockdep_assert_held(&ei->update_lock); - bi->i_size = cpu_to_le64(*new_i_size); + bi->i_size = *new_i_size; if (atomic_long_read(&ei->i_size_dirty_count)) - i_flags |= BCH_INODE_I_SIZE_DIRTY; + bi->i_flags |= BCH_INODE_I_SIZE_DIRTY; else - i_flags &= ~BCH_INODE_I_SIZE_DIRTY; - - bi->i_flags = cpu_to_le32(i_flags); + 
bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY; return 0; } @@ -122,23 +120,22 @@ i_sectors_hook_fn(struct extent_insert_hook *hook, } static int inode_set_i_sectors_dirty(struct bch_inode_info *ei, - struct bch_inode *bi, void *p) + struct bch_inode_unpacked *bi, void *p) { - BUG_ON(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY); + BUG_ON(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY); - bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags)| - BCH_INODE_I_SECTORS_DIRTY); + bi->i_flags |= BCH_INODE_I_SECTORS_DIRTY; return 0; } static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei, - struct bch_inode *bi, void *p) + struct bch_inode_unpacked *bi, + void *p) { - BUG_ON(!(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY)); + BUG_ON(!(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY)); - bi->i_sectors = cpu_to_le64(atomic64_read(&ei->i_sectors)); - bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags) & - ~BCH_INODE_I_SECTORS_DIRTY); + bi->i_sectors = atomic64_read(&ei->i_sectors); + bi->i_flags &= ~BCH_INODE_I_SECTORS_DIRTY; return 0; } @@ -203,7 +200,10 @@ static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei, struct bchfs_extent_trans_hook { struct bchfs_write_op *op; struct extent_insert_hook hook; - struct bkey_i_inode new_inode; + + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; + bool need_inode_update; }; @@ -222,6 +222,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, (k.k && bkey_extent_is_allocation(k.k)); s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign; u64 offset = min(next_pos.offset << 9, h->op->new_i_size); + bool do_pack = false; BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); @@ -234,7 +235,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, return BTREE_HOOK_RESTART_TRANS; } - h->new_inode.v.i_size = cpu_to_le64(offset); + h->inode_u.i_size = offset; + do_pack = true; + ei->i_size = offset; if (h->op->is_dio) @@ -247,7 +250,9 @@ bchfs_extent_update_hook(struct 
extent_insert_hook *hook, return BTREE_HOOK_RESTART_TRANS; } - le64_add_cpu(&h->new_inode.v.i_sectors, sectors); + h->inode_u.i_sectors += sectors; + do_pack = true; + atomic64_add(sectors, &ei->i_sectors); h->op->sectors_added += sectors; @@ -259,6 +264,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, } } + if (do_pack) + bch_inode_pack(&h->inode_p, &h->inode_u); + return BTREE_HOOK_DO_INSERT; } @@ -310,13 +318,32 @@ static int bchfs_write_index_update(struct bch_write_op *wop) break; } - bkey_reassemble(&hook.new_inode.k_i, inode); + if (WARN_ONCE(bkey_bytes(inode.k) > + sizeof(hook.inode_p), + "inode %llu too big (%zu bytes, buf %zu)", + extent_iter.pos.inode, + bkey_bytes(inode.k), + sizeof(hook.inode_p))) { + ret = -ENOENT; + break; + } + + bkey_reassemble(&hook.inode_p.inode.k_i, inode); + ret = bch_inode_unpack(bkey_s_c_to_inode(inode), + &hook.inode_u); + if (WARN_ONCE(ret, + "error %i unpacking inode %llu", + ret, extent_iter.pos.inode)) { + ret = -ENOENT; + break; + } ret = bch_btree_insert_at(wop->c, &wop->res, &hook.hook, op_journal_seq(wop), BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, BTREE_INSERT_ENTRY(&extent_iter, k), - BTREE_INSERT_ENTRY(&inode_iter, &hook.new_inode.k_i)); + BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter, + &hook.inode_p.inode.k_i, 2)); } else { ret = bch_btree_insert_at(wop->c, &wop->res, &hook.hook, op_journal_seq(wop), @@ -350,25 +377,15 @@ err: struct bch_page_state { union { struct { /* - * BCH_PAGE_ALLOCATED: page is _fully_ written on disk, and not - * compressed - which means to write this page we don't have to reserve - * space (the new write will never take up more space on disk than what - * it's overwriting) - * - * BCH_PAGE_UNALLOCATED: page is not fully written on disk, or is - * compressed - before writing we have to reserve space with - * bch_reserve_sectors() - * - * BCH_PAGE_RESERVED: page has space reserved on disk (reservation will - * be consumed when the page is written). 
+ * page is _fully_ written on disk, and not compressed - which means to + * write this page we don't have to reserve space (the new write will + * never take up more space on disk than what it's overwriting) */ - enum { - BCH_PAGE_UNALLOCATED = 0, - BCH_PAGE_ALLOCATED, - } alloc_state:2; + unsigned allocated:1; /* Owns PAGE_SECTORS sized reservation: */ unsigned reserved:1; + unsigned nr_replicas:4; /* * Number of sectors on disk - for i_blocks @@ -431,11 +448,9 @@ static int bch_get_page_reservation(struct cache_set *c, struct page *page, struct disk_reservation res; int ret = 0; - BUG_ON(s->alloc_state == BCH_PAGE_ALLOCATED && - s->sectors != PAGE_SECTORS); + BUG_ON(s->allocated && s->sectors != PAGE_SECTORS); - if (s->reserved || - s->alloc_state == BCH_PAGE_ALLOCATED) + if (s->allocated || s->reserved) return 0; ret = bch_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc @@ -448,7 +463,8 @@ static int bch_get_page_reservation(struct cache_set *c, struct page *page, bch_disk_reservation_put(c, &res); return 0; } - new.reserved = 1; + new.reserved = 1; + new.nr_replicas = res.nr_replicas; }); return 0; @@ -585,10 +601,10 @@ static void bch_mark_pages_unalloc(struct bio *bio) struct bio_vec bv; bio_for_each_segment(bv, bio, iter) - page_state(bv.bv_page)->alloc_state = BCH_PAGE_UNALLOCATED; + page_state(bv.bv_page)->allocated = 0; } -static void bch_add_page_sectors(struct bio *bio, const struct bkey *k) +static void bch_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; @@ -597,12 +613,17 @@ static void bch_add_page_sectors(struct bio *bio, const struct bkey *k) struct bch_page_state *s = page_state(bv.bv_page); /* sectors in @k from the start of this page: */ - unsigned k_sectors = k->size - (iter.bi_sector - k->p.offset); + unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset); unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); - BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); + if 
(!s->sectors) + s->nr_replicas = bch_extent_nr_dirty_ptrs(k); + else + s->nr_replicas = min_t(unsigned, s->nr_replicas, + bch_extent_nr_dirty_ptrs(k)); + BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); s->sectors += page_sectors; } } @@ -634,7 +655,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode EBUG_ON(s->reserved); - s->alloc_state = BCH_PAGE_ALLOCATED; + s->allocated = 1; s->sectors = 0; } @@ -650,7 +671,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode k = bkey_i_to_s_c(&tmp.k); if (!bkey_extent_is_allocation(k.k) || - bkey_extent_is_compressed(c, k)) + bkey_extent_is_compressed(k)) bch_mark_pages_unalloc(bio); bch_extent_pick_ptr(c, k, &pick); @@ -667,7 +688,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode swap(bio->bi_iter.bi_size, bytes); if (bkey_extent_is_allocation(k.k)) - bch_add_page_sectors(bio, k.k); + bch_add_page_sectors(bio, k); if (pick.ca) { PTR_BUCKET(pick.ca, &pick.ptr)->read_prio = @@ -859,6 +880,10 @@ static void bch_writepage_io_alloc(struct cache_set *c, struct page *page) { u64 inum = ei->vfs_inode.i_ino; + unsigned nr_replicas = page_state(page)->nr_replicas; + + EBUG_ON(!nr_replicas); + /* XXX: disk_reservation->gen isn't plumbed through */ if (!w->io) { alloc_io: @@ -881,7 +906,8 @@ alloc_io: w->io->op.op.index_update_fn = bchfs_write_index_update; } - if (bio_add_page_contig(&w->io->bio.bio, page)) { + if (w->io->op.op.res.nr_replicas != nr_replicas || + bio_add_page_contig(&w->io->bio.bio, page)) { bch_writepage_do_io(w); goto alloc_io; } @@ -936,13 +962,13 @@ do_io: /* Before unlocking the page, transfer reservation to w->io: */ old = page_state_cmpxchg(page_state(page), new, { - BUG_ON(!new.reserved && - (new.sectors != PAGE_SECTORS || - new.alloc_state != BCH_PAGE_ALLOCATED)); + EBUG_ON(!new.reserved && + (new.sectors != PAGE_SECTORS || + !new.allocated)); - if (new.alloc_state == BCH_PAGE_ALLOCATED && + if 
(new.allocated && w->io->op.op.compression_type != BCH_COMPRESSION_NONE) - new.alloc_state = BCH_PAGE_UNALLOCATED; + new.allocated = 0; else if (!new.reserved) goto out; new.reserved = 0; @@ -1919,7 +1945,7 @@ int bch_truncate(struct inode *inode, struct iattr *iattr) mutex_lock(&ei->update_lock); setattr_copy(inode, iattr); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); /* clear I_SIZE_DIRTY: */ i_size_dirty_put(ei); @@ -1981,7 +2007,7 @@ static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len) ret = bch_discard(c, POS(ino, discard_start), POS(ino, discard_end), - 0, + ZERO_VERSION, &disk_res, &i_sectors_hook.hook, &ei->journal_seq); @@ -2132,12 +2158,11 @@ static long bch_fallocate(struct inode *inode, int mode, struct cache_set *c = inode->i_sb->s_fs_info; struct i_sectors_hook i_sectors_hook; struct btree_iter iter; - struct bkey_i reservation; - struct bkey_s_c k; struct bpos end; loff_t block_start, block_end; loff_t new_size = offset + len; unsigned sectors; + unsigned replicas = READ_ONCE(c->opts.data_replicas); int ret; bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN); @@ -2186,13 +2211,16 @@ static long bch_fallocate(struct inode *inode, int mode, while (bkey_cmp(iter.pos, end) < 0) { struct disk_reservation disk_res = { 0 }; + struct bkey_i_reservation reservation; + struct bkey_s_c k; k = bch_btree_iter_peek_with_holes(&iter); if ((ret = btree_iter_err(k))) goto btree_iter_err; /* already reserved */ - if (k.k->type == BCH_RESERVATION) { + if (k.k->type == BCH_RESERVATION && + bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { bch_btree_iter_advance_pos(&iter); continue; } @@ -2204,29 +2232,32 @@ static long bch_fallocate(struct inode *inode, int mode, } } - bkey_init(&reservation.k); + bkey_reservation_init(&reservation.k_i); reservation.k.type = BCH_RESERVATION; reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch_cut_front(iter.pos, 
&reservation); + bch_cut_front(iter.pos, &reservation.k_i); bch_cut_back(end, &reservation.k); sectors = reservation.k.size; + reservation.v.nr_replicas = bch_extent_nr_dirty_ptrs(k); - if (!bkey_extent_is_allocation(k.k) || - bkey_extent_is_compressed(c, k)) { + if (reservation.v.nr_replicas < replicas || + bkey_extent_is_compressed(k)) { ret = bch_disk_reservation_get(c, &disk_res, sectors, 0); if (ret) goto err_put_sectors_dirty; + + reservation.v.nr_replicas = disk_res.nr_replicas; } ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook, &ei->journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &reservation)); + BTREE_INSERT_ENTRY(&iter, &reservation.k_i)); bch_disk_reservation_put(c, &disk_res); btree_iter_err: if (ret < 0 && ret != -EINTR) diff --git a/libbcache/fs.c b/libbcache/fs.c index 884a950f..76948e79 100644 --- a/libbcache/fs.c +++ b/libbcache/fs.c @@ -26,7 +26,9 @@ static struct kmem_cache *bch_inode_cache; -static void bch_vfs_inode_init(struct bch_inode_info *, struct bkey_s_c_inode); +static void bch_vfs_inode_init(struct cache_set *, + struct bch_inode_info *, + struct bch_inode_unpacked *); /* * I_SIZE_DIRTY requires special handling: @@ -63,11 +65,20 @@ int __must_check __bch_write_inode(struct cache_set *c, { struct btree_iter iter; struct inode *inode = &ei->vfs_inode; - struct bkey_i_inode new_inode; - struct bch_inode *bi; + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; u64 inum = inode->i_ino; + unsigned i_nlink = READ_ONCE(inode->i_nlink); int ret; + /* + * We can't write an inode with i_nlink == 0 because it's stored biased; + * however, we don't need to because if i_nlink is 0 the inode is + * getting deleted when it's evicted. 
+ */ + if (!i_nlink) + return 0; + lockdep_assert_held(&ei->update_lock); bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0)); @@ -84,33 +95,41 @@ int __must_check __bch_write_inode(struct cache_set *c, return -ENOENT; } - bkey_reassemble(&new_inode.k_i, k); - bi = &new_inode.v; + ret = bch_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + if (WARN_ONCE(ret, + "error %i unpacking inode %llu", ret, inum)) { + ret = -ENOENT; + break; + } if (set) { - ret = set(ei, bi, p); + ret = set(ei, &inode_u, p); if (ret) goto out; } - bi->i_mode = cpu_to_le16(inode->i_mode); - bi->i_uid = cpu_to_le32(i_uid_read(inode)); - bi->i_gid = cpu_to_le32(i_gid_read(inode)); - bi->i_nlink = cpu_to_le32(inode->i_nlink); - bi->i_dev = cpu_to_le32(inode->i_rdev); - bi->i_atime = cpu_to_le64(timespec_to_ns(&inode->i_atime)); - bi->i_mtime = cpu_to_le64(timespec_to_ns(&inode->i_mtime)); - bi->i_ctime = cpu_to_le64(timespec_to_ns(&inode->i_ctime)); + BUG_ON(i_nlink < nlink_bias(inode->i_mode)); + + inode_u.i_mode = inode->i_mode; + inode_u.i_uid = i_uid_read(inode); + inode_u.i_gid = i_gid_read(inode); + inode_u.i_nlink = i_nlink - nlink_bias(inode->i_mode); + inode_u.i_dev = inode->i_rdev; + inode_u.i_atime = timespec_to_bch_time(c, inode->i_atime); + inode_u.i_mtime = timespec_to_bch_time(c, inode->i_mtime); + inode_u.i_ctime = timespec_to_bch_time(c, inode->i_ctime); + + bch_inode_pack(&inode_p, &inode_u); ret = bch_btree_insert_at(c, NULL, NULL, &ei->journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &new_inode.k_i)); + BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i)); } while (ret == -EINTR); if (!ret) { - ei->i_size = le64_to_cpu(bi->i_size); - ei->i_flags = le32_to_cpu(bi->i_flags); + ei->i_size = inode_u.i_size; + ei->i_flags = inode_u.i_flags; } out: bch_btree_iter_unlock(&iter); @@ -138,7 +157,7 @@ int bch_inc_nlink(struct cache_set *c, struct bch_inode_info *ei) int bch_dec_nlink(struct cache_set *c, struct bch_inode_info *ei) { - int ret; 
+ int ret = 0; mutex_lock(&ei->update_lock); drop_nlink(&ei->vfs_inode); @@ -152,9 +171,8 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum) { struct cache_set *c = sb->s_fs_info; struct inode *inode; + struct bch_inode_unpacked inode_u; struct bch_inode_info *ei; - struct btree_iter iter; - struct bkey_s_c k; int ret; pr_debug("inum %llu", inum); @@ -165,24 +183,19 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum) if (!(inode->i_state & I_NEW)) return inode; - bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0)); - k = bch_btree_iter_peek_with_holes(&iter); - - if ((ret = btree_iter_err(k)) || k.k->type != BCH_INODE_FS) { - ret = bch_btree_iter_unlock(&iter); + ret = bch_inode_find_by_inum(c, inum, &inode_u); + if (ret) { iget_failed(inode); - return ERR_PTR(ret ?: -ENOENT); + return ERR_PTR(ret); } ei = to_bch_ei(inode); - bch_vfs_inode_init(ei, bkey_s_c_to_inode(k)); + bch_vfs_inode_init(c, ei, &inode_u); ei->journal_seq = bch_inode_journal_seq(&c->journal, inum); unlock_new_inode(inode); - bch_btree_iter_unlock(&iter); - return inode; } @@ -193,7 +206,8 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c, struct inode *inode; struct posix_acl *default_acl = NULL, *acl = NULL; struct bch_inode_info *ei; - struct bkey_i_inode bkey_inode; + struct bch_inode_unpacked inode_u; + struct bkey_inode_buf inode_p; int ret; inode = new_inode(parent->i_sb); @@ -210,10 +224,11 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c, ei = to_bch_ei(inode); - bch_inode_init(c, &bkey_inode, i_uid_read(inode), + bch_inode_init(c, &inode_u, i_uid_read(inode), i_gid_read(inode), inode->i_mode, rdev); + bch_inode_pack(&inode_p, &inode_u); - ret = bch_inode_create(c, &bkey_inode.k_i, + ret = bch_inode_create(c, &inode_p.inode.k_i, BLOCKDEV_INODE_MAX, 0, &c->unused_inode_hint); if (unlikely(ret)) { @@ -225,7 +240,8 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c, goto err; } - 
bch_vfs_inode_init(ei, inode_i_to_s_c(&bkey_inode)); + inode_u.inum = inode_p.inode.k.p.inode; + bch_vfs_inode_init(c, ei, &inode_u); if (default_acl) { ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); @@ -266,7 +282,7 @@ static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir, if (unlikely(ret)) return ret; - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); mark_inode_dirty_sync(dir); return 0; } @@ -337,7 +353,7 @@ static int bch_link(struct dentry *old_dentry, struct inode *dir, lockdep_assert_held(&inode->i_rwsem); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_fs_time(dir->i_sb); ret = bch_inc_nlink(c, ei); if (ret) @@ -382,12 +398,7 @@ static int bch_unlink(struct inode *dir, struct dentry *dentry) drop_nlink(inode); } - drop_nlink(inode); - if (inode->i_nlink) { - mutex_lock(&ei->update_lock); - ret = bch_write_inode(c, ei); - mutex_unlock(&ei->update_lock); - } + bch_dec_nlink(c, ei); return 0; } @@ -473,7 +484,7 @@ static int bch_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct bch_inode_info *ei = to_bch_ei(old_inode); struct inode *new_inode = new_dentry->d_inode; - struct timespec now = CURRENT_TIME; + struct timespec now = current_fs_time(old_dir->i_sb); int ret; lockdep_assert_held(&old_dir->i_rwsem); @@ -550,7 +561,7 @@ static int bch_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct bch_inode_info *ei = to_bch_ei(old_inode); - struct timespec now = CURRENT_TIME; + struct timespec now = current_fs_time(old_dir->i_sb); int ret; ret = bch_dirent_rename(c, @@ -783,14 +794,14 @@ static unsigned bch_inode_flags_to_user_flags(unsigned flags) } static int bch_inode_user_flags_set(struct bch_inode_info *ei, - struct bch_inode *bi, + struct bch_inode_unpacked *bi, void *p) { /* * We're relying 
on btree locking here for exclusion with other ioctl * calls - use the flags in the btree (@bi), not ei->i_flags: */ - unsigned bch_flags = le32_to_cpu(bi->i_flags); + unsigned bch_flags = bi->i_flags; unsigned oldflags = bch_inode_flags_to_user_flags(bch_flags); unsigned newflags = *((unsigned *) p); unsigned i; @@ -812,8 +823,8 @@ static int bch_inode_user_flags_set(struct bch_inode_info *ei, if (oldflags != newflags) return -EOPNOTSUPP; - bi->i_flags = cpu_to_le32(bch_flags); - ei->vfs_inode.i_ctime = CURRENT_TIME; + bi->i_flags = bch_flags; + ei->vfs_inode.i_ctime = current_fs_time(ei->vfs_inode.i_sb); return 0; } @@ -1010,32 +1021,33 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; -static void bch_vfs_inode_init(struct bch_inode_info *ei, - struct bkey_s_c_inode bkey_inode) +static void bch_vfs_inode_init(struct cache_set *c, + struct bch_inode_info *ei, + struct bch_inode_unpacked *bi) { struct inode *inode = &ei->vfs_inode; - const struct bch_inode *bi = bkey_inode.v; pr_debug("init inode %llu with mode %o", - bkey_inode.k->p.inode, bi->i_mode); - - ei->i_flags = le32_to_cpu(bi->i_flags); - ei->i_size = le64_to_cpu(bi->i_size); - - inode->i_mode = le16_to_cpu(bi->i_mode); - i_uid_write(inode, le32_to_cpu(bi->i_uid)); - i_gid_write(inode, le32_to_cpu(bi->i_gid)); - - atomic64_set(&ei->i_sectors, le64_to_cpu(bi->i_sectors)); - inode->i_blocks = atomic64_read(&ei->i_sectors); - - inode->i_ino = bkey_inode.k->p.inode; - set_nlink(inode, le32_to_cpu(bi->i_nlink)); - inode->i_rdev = le32_to_cpu(bi->i_dev); - inode->i_size = le64_to_cpu(bi->i_size); - inode->i_atime = ns_to_timespec(le64_to_cpu(bi->i_atime)); - inode->i_mtime = ns_to_timespec(le64_to_cpu(bi->i_mtime)); - inode->i_ctime = ns_to_timespec(le64_to_cpu(bi->i_ctime)); + bi->inum, bi->i_mode); + + ei->i_flags = bi->i_flags; + ei->i_size = bi->i_size; + + inode->i_mode = bi->i_mode; + i_uid_write(inode, bi->i_uid); + 
i_gid_write(inode, bi->i_gid); + + atomic64_set(&ei->i_sectors, bi->i_sectors); + inode->i_blocks = bi->i_sectors; + + inode->i_ino = bi->inum; + set_nlink(inode, bi->i_nlink + nlink_bias(inode->i_mode)); + inode->i_rdev = bi->i_dev; + inode->i_generation = bi->i_generation; + inode->i_size = bi->i_size; + inode->i_atime = bch_time_to_timespec(c, bi->i_atime); + inode->i_mtime = bch_time_to_timespec(c, bi->i_mtime); + inode->i_ctime = bch_time_to_timespec(c, bi->i_ctime); bch_inode_flags_to_vfs(inode); ei->str_hash = bch_hash_info_init(bi); @@ -1149,8 +1161,8 @@ static int bch_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = atomic_long_read(&c->nr_inodes); buf->f_ffree = U64_MAX; - fsid = le64_to_cpup((void *) c->disk_sb.user_uuid.b) ^ - le64_to_cpup((void *) c->disk_sb.user_uuid.b + sizeof(u64)); + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; buf->f_namelen = NAME_MAX; @@ -1380,7 +1392,7 @@ static struct dentry *bch_mount(struct file_system_type *fs_type, sb->s_op = &bch_super_operations; sb->s_xattr = bch_xattr_handlers; sb->s_magic = BCACHE_STATFS_MAGIC; - sb->s_time_gran = 1; + sb->s_time_gran = c->sb.time_precision; c->vfs_sb = sb; sb->s_bdi = &c->bdi; diff --git a/libbcache/fs.h b/libbcache/fs.h index c9820241..aec6159b 100644 --- a/libbcache/fs.h +++ b/libbcache/fs.h @@ -34,9 +34,16 @@ static inline u8 mode_to_type(umode_t mode) return (mode >> 12) & 15; } +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 
2 : 1; +} + +struct bch_inode_unpacked; + /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, - struct bch_inode *, void *); + struct bch_inode_unpacked *, void *); int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *, inode_set_fn, void *); diff --git a/libbcache/inode.c b/libbcache/inode.c index 200deb0e..b72a1c51 100644 --- a/libbcache/inode.c +++ b/libbcache/inode.c @@ -9,51 +9,195 @@ #include <linux/random.h> -ssize_t bch_inode_status(char *buf, size_t len, const struct bkey *k) +#include <asm/unaligned.h> + +#define FIELD_BYTES() \ + +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; +static const u8 bits_table[8] = { + 1 * 8 - 1, + 2 * 8 - 2, + 3 * 8 - 3, + 4 * 8 - 4, + 6 * 8 - 5, + 8 * 8 - 6, + 10 * 8 - 7, + 13 * 8 - 8, +}; + +static int inode_encode_field(u8 *out, u8 *end, const u64 in[2]) { - if (k->p.offset) - return scnprintf(buf, len, "offset nonzero: %llu", k->p.offset); - - if (k->size) - return scnprintf(buf, len, "size nonzero: %u", k->size); - - switch (k->type) { - case KEY_TYPE_DELETED: - return scnprintf(buf, len, "deleted"); - case KEY_TYPE_DISCARD: - return scnprintf(buf, len, "discarded"); - case KEY_TYPE_ERROR: - return scnprintf(buf, len, "error"); - case KEY_TYPE_COOKIE: - return scnprintf(buf, len, "cookie"); + unsigned bytes, bits, shift; - case BCH_INODE_FS: - if (bkey_val_bytes(k) != sizeof(struct bch_inode)) - return scnprintf(buf, len, "bad size: %zu", - bkey_val_bytes(k)); + if (likely(!in[1])) + bits = fls64(in[0]); + else + bits = fls64(in[1]) + 64; - if (k->p.inode < BLOCKDEV_INODE_MAX) - return scnprintf(buf, len, - "fs inode in blockdev range: %llu", - k->p.inode); - return 0; + for (shift = 1; shift <= 8; shift++) + if (bits < bits_table[shift - 1]) + goto got_shift; - case BCH_INODE_BLOCKDEV: - if (bkey_val_bytes(k) != sizeof(struct bch_inode_blockdev)) - return scnprintf(buf, len, "bad size: %zu", - bkey_val_bytes(k)); 
+ BUG(); +got_shift: + bytes = byte_table[shift - 1]; - if (k->p.inode >= BLOCKDEV_INODE_MAX) - return scnprintf(buf, len, - "blockdev inode in fs range: %llu", - k->p.inode); - return 0; + BUG_ON(out + bytes > end); - default: - return scnprintf(buf, len, "unknown inode type: %u", k->type); + if (likely(bytes <= 8)) { + u64 b = cpu_to_be64(in[0]); + + memcpy(out, (void *) &b + 8 - bytes, bytes); + } else { + u64 b = cpu_to_be64(in[1]); + + memcpy(out, (void *) &b + 16 - bytes, bytes); + put_unaligned_be64(in[0], out + bytes - 8); + } + + *out |= (1 << 8) >> shift; + + return bytes; +} + +static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) +{ + unsigned bytes, bits, shift; + + if (in >= end) + return -1; + + if (!*in) + return -1; + + /* + * position of highest set bit indicates number of bytes: + * shift = number of bits to remove in high byte: + */ + shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ + bytes = byte_table[shift - 1]; + bits = bytes * 8 - shift; + + if (in + bytes > end) + return -1; + + /* + * we're assuming it's safe to deref up to 7 bytes < in; this will work + * because keys always start quite a bit more than 7 bytes after the + * start of the btree node header: + */ + if (likely(bytes <= 8)) { + out[0] = get_unaligned_be64(in + bytes - 8); + out[0] <<= 64 - bits; + out[0] >>= 64 - bits; + out[1] = 0; + } else { + out[0] = get_unaligned_be64(in + bytes - 8); + out[1] = get_unaligned_be64(in + bytes - 16); + out[1] <<= 128 - bits; + out[1] >>= 128 - bits; + } + + *out_bits = out[1] ? 
64 + fls64(out[1]) : fls64(out[0]); + return bytes; +} + +void bch_inode_pack(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + u8 *out = packed->inode.v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + u64 field[2]; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + + bkey_inode_init(&packed->inode.k_i); + packed->inode.k.p.inode = inode->inum; + packed->inode.v.i_hash_seed = inode->i_hash_seed; + packed->inode.v.i_flags = cpu_to_le32(inode->i_flags); + packed->inode.v.i_mode = cpu_to_le16(inode->i_mode); + +#define BCH_INODE_FIELD(_name, _bits) \ + field[0] = inode->_name; \ + field[1] = 0; \ + out += inode_encode_field(out, end, field); \ + nr_fields++; \ + \ + if (field[0] | field[1]) { \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } + + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v); + memset(out, 0, + (u8 *) &packed->inode.v + + bkey_val_bytes(&packed->inode.k) - out); + + SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) { + struct bch_inode_unpacked unpacked; + + int ret = bch_inode_unpack(inode_i_to_s_c(&packed->inode), + &unpacked); + BUG_ON(ret); + BUG_ON(unpacked.inum != inode->inum); + BUG_ON(unpacked.i_hash_seed != inode->i_hash_seed); + BUG_ON(unpacked.i_mode != inode->i_mode); + +#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name); + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD } } +int bch_inode_unpack(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); + u64 field[2]; + unsigned fieldnr = 0, field_bits; + int ret; + + unpacked->inum = inode.k->p.inode; + unpacked->i_hash_seed = inode.v->i_hash_seed; + unpacked->i_flags = le32_to_cpu(inode.v->i_flags); + 
unpacked->i_mode = le16_to_cpu(inode.v->i_mode); + +#define BCH_INODE_FIELD(_name, _bits) \ + if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ + memset(&unpacked->_name, 0, \ + sizeof(*unpacked) - \ + offsetof(struct bch_inode_unpacked, _name)); \ + return 0; \ + } \ + \ + ret = inode_decode_field(in, end, field, &field_bits); \ + if (ret < 0) \ + return ret; \ + \ + if (field_bits > sizeof(unpacked->_name) * 8) \ + return -1; \ + \ + unpacked->_name = field[0]; \ + in += ret; + + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD + + /* XXX: signal if there were more fields than expected? */ + + return 0; +} + static const char *bch_inode_invalid(const struct cache_set *c, struct bkey_s_c k) { @@ -63,16 +207,20 @@ static const char *bch_inode_invalid(const struct cache_set *c, switch (k.k->type) { case BCH_INODE_FS: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode)) + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) return "incorrect value size"; if (k.k->p.inode < BLOCKDEV_INODE_MAX) return "fs inode in blockdev range"; - if (INODE_STR_HASH_TYPE(inode.v) >= BCH_STR_HASH_NR) + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) return "invalid str hash type"; + if (bch_inode_unpack(inode, &unpacked)) + return "invalid variable length fields"; + return NULL; } case BCH_INODE_BLOCKDEV: @@ -92,12 +240,17 @@ static void bch_inode_to_text(struct cache_set *c, char *buf, size_t size, struct bkey_s_c k) { struct bkey_s_c_inode inode; + struct bch_inode_unpacked unpacked; switch (k.k->type) { case BCH_INODE_FS: inode = bkey_s_c_to_inode(k); + if (bch_inode_unpack(inode, &unpacked)) { + scnprintf(buf, size, "(unpack error)"); + break; + } - scnprintf(buf, size, "i_size %llu", inode.v->i_size); + scnprintf(buf, size, "i_size %llu", unpacked.i_size); break; } } @@ -107,26 +260,25 @@ const struct bkey_ops bch_bkey_inode_ops = { .val_to_text = bch_inode_to_text, }; -void bch_inode_init(struct 
cache_set *c, struct bkey_i_inode *inode, +void bch_inode_init(struct cache_set *c, struct bch_inode_unpacked *inode_u, uid_t uid, gid_t gid, umode_t mode, dev_t rdev) { - struct timespec ts = CURRENT_TIME; - s64 now = timespec_to_ns(&ts); - struct bch_inode *bi; - - bi = &bkey_inode_init(&inode->k_i)->v; - bi->i_uid = cpu_to_le32(uid); - bi->i_gid = cpu_to_le32(gid); - - bi->i_mode = cpu_to_le16(mode); - bi->i_dev = cpu_to_le32(rdev); - bi->i_atime = cpu_to_le64(now); - bi->i_mtime = cpu_to_le64(now); - bi->i_ctime = cpu_to_le64(now); - bi->i_nlink = cpu_to_le32(S_ISDIR(mode) ? 2 : 1); - - get_random_bytes(&bi->i_hash_seed, sizeof(bi->i_hash_seed)); - SET_INODE_STR_HASH_TYPE(bi, c->sb.str_hash_type); + s64 now = timespec_to_bch_time(c, CURRENT_TIME); + + memset(inode_u, 0, sizeof(*inode_u)); + + /* ick */ + inode_u->i_flags |= c->sb.str_hash_type << INODE_STR_HASH_OFFSET; + get_random_bytes(&inode_u->i_hash_seed, sizeof(inode_u->i_hash_seed)); + + inode_u->i_mode = mode; + inode_u->i_uid = uid; + inode_u->i_gid = gid; + inode_u->i_dev = rdev; + inode_u->i_atime = now; + inode_u->i_mtime = now; + inode_u->i_ctime = now; + inode_u->i_otime = now; } int bch_inode_create(struct cache_set *c, struct bkey_i *inode, @@ -200,7 +352,7 @@ int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size, struct extent_insert_hook *hook, u64 *journal_seq) { return bch_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0), - 0, NULL, hook, journal_seq); + ZERO_VERSION, NULL, hook, journal_seq); } int bch_inode_rm(struct cache_set *c, u64 inode_nr) @@ -215,7 +367,7 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr) ret = bch_btree_delete_range(c, BTREE_ID_XATTRS, POS(inode_nr, 0), POS(inode_nr + 1, 0), - 0, NULL, NULL, NULL); + ZERO_VERSION, NULL, NULL, NULL); if (ret < 0) return ret; @@ -230,7 +382,7 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr) ret = bch_btree_delete_range(c, BTREE_ID_DIRENTS, POS(inode_nr, 0), POS(inode_nr + 1, 0), - 0, NULL, NULL, 
NULL); + ZERO_VERSION, NULL, NULL, NULL); if (ret < 0) return ret; @@ -241,25 +393,19 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr) NULL, NULL, BTREE_INSERT_NOFAIL); } -int bch_inode_update(struct cache_set *c, struct bkey_i *inode, - u64 *journal_seq) -{ - return bch_btree_update(c, BTREE_ID_INODES, inode, journal_seq); -} - int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr, - struct bkey_i_inode *inode) + struct bch_inode_unpacked *inode) { struct btree_iter iter; struct bkey_s_c k; + int ret = -ENOENT; for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0), k) { switch (k.k->type) { case BCH_INODE_FS: - bkey_reassemble(&inode->k_i, k); - bch_btree_iter_unlock(&iter); - return 0; + ret = bch_inode_unpack(bkey_s_c_to_inode(k), inode); + break; default: /* hole, not found */ break; @@ -269,7 +415,7 @@ int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr, } - return bch_btree_iter_unlock(&iter) ?: -ENOENT; + return bch_btree_iter_unlock(&iter) ?: ret; } int bch_cached_dev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid, diff --git a/libbcache/inode.h b/libbcache/inode.h index fa1a4cf9..81dccf68 100644 --- a/libbcache/inode.h +++ b/libbcache/inode.h @@ -3,18 +3,53 @@ extern const struct bkey_ops bch_bkey_inode_ops; -ssize_t bch_inode_status(char *, size_t, const struct bkey *); +struct bch_inode_unpacked { + u64 inum; + __le64 i_hash_seed; + u32 i_flags; + u16 i_mode; -void bch_inode_init(struct cache_set *, struct bkey_i_inode *, +#define BCH_INODE_FIELD(_name, _bits) u##_bits _name; + BCH_INODE_FIELDS() +#undef BCH_INODE_FIELD +}; + +struct bkey_inode_buf { + struct bkey_i_inode inode; + +#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8 + u8 _pad[0 + BCH_INODE_FIELDS()]; +#undef BCH_INODE_FIELD +} __packed; + +void bch_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +int bch_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); + +void bch_inode_init(struct cache_set *, 
struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t); int bch_inode_create(struct cache_set *, struct bkey_i *, u64, u64, u64 *); int bch_inode_truncate(struct cache_set *, u64, u64, struct extent_insert_hook *, u64 *); int bch_inode_rm(struct cache_set *, u64); -int bch_inode_update(struct cache_set *, struct bkey_i *, u64 *); -int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *); +int bch_inode_find_by_inum(struct cache_set *, u64, + struct bch_inode_unpacked *); int bch_cached_dev_inode_find_by_uuid(struct cache_set *, uuid_le *, struct bkey_i_inode_blockdev *); +static inline struct timespec bch_time_to_timespec(struct cache_set *c, u64 time) +{ + return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo); +} + +static inline u64 timespec_to_bch_time(struct cache_set *c, struct timespec ts) +{ + s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo; + + if (c->sb.time_precision == 1) + return ns; + + return div_s64(ns, c->sb.time_precision); +} + #endif diff --git a/libbcache/io.c b/libbcache/io.c index 4112ea50..2f0e48a0 100644 --- a/libbcache/io.c +++ b/libbcache/io.c @@ -22,7 +22,7 @@ #include "move.h" #include "notify.h" #include "stats.h" -#include "super.h" +#include "super-io.h" #include <linux/blkdev.h> #include <linux/random.h> @@ -382,11 +382,27 @@ static void bch_write_endio(struct bio *bio) closure_put(cl); } +static struct nonce extent_nonce(struct bversion version, + unsigned nonce, + unsigned uncompressed_size, + unsigned compression_type) +{ + return (struct nonce) {{ + [0] = cpu_to_le32((nonce << 12) | + (uncompressed_size << 22)), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| + (compression_type << 24))^BCH_NONCE_EXTENT, + }}; +} + static void init_append_extent(struct bch_write_op *op, unsigned compressed_size, unsigned uncompressed_size, unsigned compression_type, - u64 csum, unsigned csum_type, + unsigned nonce, + struct bch_csum csum, unsigned 
csum_type, struct open_bucket *ob) { struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); @@ -394,11 +410,13 @@ static void init_append_extent(struct bch_write_op *op, op->pos.offset += uncompressed_size; e->k.p = op->pos; e->k.size = uncompressed_size; + e->k.version = op->version; + bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); bch_extent_crc_append(e, compressed_size, uncompressed_size, compression_type, - csum, csum_type); + nonce, csum, csum_type); bch_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas, ob, compressed_size); @@ -417,7 +435,7 @@ static int bch_write_extent(struct bch_write_op *op, unsigned key_to_write_offset = op->insert_keys.top_p - op->insert_keys.keys_p; struct bkey_i *key_to_write; - unsigned csum_type = c->opts.data_checksum; + unsigned csum_type = op->csum_type; unsigned compression_type = op->compression_type; int ret; @@ -426,8 +444,8 @@ static int bch_write_extent(struct bch_write_op *op, /* Need to decompress data? */ if ((op->flags & BCH_WRITE_DATA_COMPRESSED) && - (op->crc.uncompressed_size != op->size || - op->crc.compressed_size > ob->sectors_free)) { + (crc_uncompressed_size(NULL, &op->crc) != op->size || + crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) { int ret; ret = bch_bio_uncompress_inplace(c, orig, op->size, op->crc); @@ -439,9 +457,10 @@ static int bch_write_extent(struct bch_write_op *op, if (op->flags & BCH_WRITE_DATA_COMPRESSED) { init_append_extent(op, - op->crc.compressed_size, - op->crc.uncompressed_size, + crc_compressed_size(NULL, &op->crc), + crc_uncompressed_size(NULL, &op->crc), op->crc.compression_type, + op->crc.nonce, op->crc.csum, op->crc.csum_type, ob); @@ -457,7 +476,10 @@ static int bch_write_extent(struct bch_write_op *op, /* all units here in bytes */ unsigned total_output = 0, output_available = min(ob->sectors_free << 9, orig->bi_iter.bi_size); - u64 csum; + unsigned crc_nonce = bch_csum_type_is_encryption(csum_type) + ? 
op->nonce : 0; + struct bch_csum csum; + struct nonce nonce; bio = bio_alloc_bioset(GFP_NOIO, DIV_ROUND_UP(output_available, PAGE_SIZE), @@ -489,13 +511,20 @@ static int bch_write_extent(struct bch_write_op *op, BUG_ON(src_len & (block_bytes(c) - 1)); swap(bio->bi_iter.bi_size, dst_len); - csum = bch_checksum_bio(bio, csum_type); + nonce = extent_nonce(op->version, + crc_nonce, + src_len >> 9, + compression_type), + + bch_encrypt_bio(c, csum_type, nonce, bio); + + csum = bch_checksum_bio(c, csum_type, nonce, bio); swap(bio->bi_iter.bi_size, dst_len); init_append_extent(op, dst_len >> 9, src_len >> 9, fragment_compression_type, - csum, csum_type, ob); + crc_nonce, csum, csum_type, ob); total_output += dst_len; bio_advance(bio, dst_len); @@ -531,7 +560,8 @@ static int bch_write_extent(struct bch_write_op *op, wbio->put_bio = bio != orig; init_append_extent(op, bio_sectors(bio), bio_sectors(bio), - compression_type, 0, csum_type, ob); + compression_type, 0, + (struct bch_csum) { 0 }, csum_type, ob); ret = bio != orig; } @@ -546,8 +576,7 @@ static int bch_write_extent(struct bch_write_op *op, key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - if (!(op->flags & BCH_WRITE_CACHED)) - bch_check_mark_super(c, key_to_write, false); + bch_check_mark_super(c, key_to_write, false); #ifndef CONFIG_BCACHE_NO_IO bch_submit_wbio_replicas(to_wbio(bio), c, key_to_write, false); @@ -748,6 +777,11 @@ void bch_write(struct closure *cl) closure_return(cl); } + if (bversion_zero(op->version) && + bch_csum_type_is_encryption(op->csum_type)) + op->version.lo = + atomic64_inc_return(&c->key_version) + 1; + if (!(op->flags & BCH_WRITE_DISCARD)) bch_increment_clock(c, bio_sectors(bio), WRITE); @@ -804,17 +838,21 @@ void bch_write_op_init(struct bch_write_op *op, struct cache_set *c, struct write_point *wp, struct bpos pos, u64 *journal_seq, unsigned flags) { + EBUG_ON(res.sectors && !res.nr_replicas); + op->c = c; op->io_wq = index_update_wq(op); op->bio = bio; 
op->written = 0; op->error = 0; op->flags = flags; + op->csum_type = bch_data_checksum_type(c); op->compression_type = c->opts.compression; op->nr_replicas = res.nr_replicas; op->alloc_reserve = RESERVE_NONE; + op->nonce = 0; op->pos = pos; - op->version = 0; + op->version = ZERO_VERSION; op->res = res; op->wp = wp; @@ -853,7 +891,7 @@ void bch_write_op_init(struct bch_write_op *op, struct cache_set *c, * appropriately inode_truncate should call this */ int bch_discard(struct cache_set *c, struct bpos start, - struct bpos end, u64 version, + struct bpos end, struct bversion version, struct disk_reservation *disk_res, struct extent_insert_hook *hook, u64 *journal_seq) @@ -878,7 +916,11 @@ static int bio_checksum_uncompress(struct cache_set *c, struct bio *src = &rbio->bio; struct bio *dst = &bch_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->parent_iter; - u64 csum; + struct nonce nonce = extent_nonce(rbio->version, + rbio->crc.nonce, + crc_uncompressed_size(NULL, &rbio->crc), + rbio->crc.compression_type); + struct bch_csum csum; int ret = 0; /* @@ -888,18 +930,19 @@ static int bio_checksum_uncompress(struct cache_set *c, * in order to promote */ if (rbio->bounce) { - src->bi_iter.bi_size = rbio->crc.compressed_size << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; + src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; } else { src->bi_iter = rbio->parent_iter; } - csum = bch_checksum_bio(src, rbio->crc.csum_type); - if (cache_nonfatal_io_err_on(rbio->crc.csum != csum, rbio->ca, - "data checksum error, inode %llu offset %llu: expected %0llx got %0llx (type %u)", + csum = bch_checksum_bio(c, rbio->crc.csum_type, nonce, src); + if (cache_nonfatal_io_err_on(bch_crc_cmp(rbio->crc.csum, csum), rbio->ca, + "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", rbio->inode, (u64) rbio->parent_iter.bi_sector << 9, - rbio->crc.csum, 
csum, rbio->crc.csum_type)) + rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo, + rbio->crc.csum_type)) ret = -EIO; /* @@ -908,6 +951,7 @@ static int bio_checksum_uncompress(struct cache_set *c, */ if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) { if (!ret) { + bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src); ret = bch_bio_uncompress(c, src, dst, dst_iter, rbio->crc); if (ret) @@ -915,8 +959,20 @@ static int bio_checksum_uncompress(struct cache_set *c, } } else if (rbio->bounce) { bio_advance(src, rbio->crc.offset << 9); + + /* don't need to decrypt the entire bio: */ + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + nonce = nonce_add(nonce, rbio->crc.offset << 9); + + bch_encrypt_bio(c, rbio->crc.csum_type, + nonce, src); + bio_copy_data_iter(dst, dst_iter, src, src->bi_iter); + } else { + bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src); } return ret; @@ -1108,7 +1164,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, */ unsigned sectors = max_t(unsigned, k.k->size, - pick->crc.uncompressed_size); + crc_uncompressed_size(NULL, &pick->crc)); unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); promote_op = kmalloc(sizeof(*promote_op) + @@ -1130,7 +1186,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, */ if (pick->crc.compression_type != BCH_COMPRESSION_NONE || (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || + (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) || (flags & BCH_READ_FORCE_BOUNCE)))) { read_full = true; bounce = true; @@ -1138,7 +1194,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, if (bounce) { unsigned sectors = read_full - ? (pick->crc.compressed_size ?: k.k->size) + ? 
(crc_compressed_size(NULL, &pick->crc) ?: k.k->size) : bvec_iter_sectors(iter); rbio = container_of(bio_alloc_bioset(GFP_NOIO, @@ -1183,6 +1239,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, rbio->flags = flags; rbio->bounce = bounce; rbio->split = split; + rbio->version = k.k->version; rbio->crc = pick->crc; /* * crc.compressed_size will be 0 if there wasn't any checksum @@ -1190,7 +1247,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, * bounced (which isn't necessarily the original key size, if we bounced * only for promoting) */ - rbio->crc.compressed_size = bio_sectors(&rbio->bio); + rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1; rbio->ptr = pick->ptr; rbio->ca = pick->ca; rbio->promote = promote_op; @@ -1210,7 +1267,8 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig, bch_migrate_write_init(c, &promote_op->write, &c->promote_write_point, k, NULL, - BCH_WRITE_ALLOC_NOWAIT); + BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_CACHED); promote_op->write.promote = true; if (rbio->crc.compression_type) { diff --git a/libbcache/io.h b/libbcache/io.h index b7668b4e..99e51089 100644 --- a/libbcache/io.h +++ b/libbcache/io.h @@ -79,7 +79,7 @@ void bch_submit_wbio_replicas(struct bch_write_bio *, struct cache_set *, const struct bkey_i *, bool); int bch_discard(struct cache_set *, struct bpos, struct bpos, - u64, struct disk_reservation *, + struct bversion, struct disk_reservation *, struct extent_insert_hook *, u64 *); void bch_read_retry_work(struct work_struct *); diff --git a/libbcache/io_types.h b/libbcache/io_types.h index f7d99cdb..64269d94 100644 --- a/libbcache/io_types.h +++ b/libbcache/io_types.h @@ -43,7 +43,8 @@ struct bch_read_bio { u8 bounce:1, split:1; - struct bch_extent_crc64 crc; + struct bversion version; + struct bch_extent_crc128 crc; struct bch_extent_ptr ptr; struct cache *ca; @@ -101,15 +102,17 @@ struct bch_write_op { short error; u16 flags; + unsigned 
csum_type:4; unsigned compression_type:4; unsigned nr_replicas:4; unsigned alloc_reserve:4; + unsigned nonce:14; struct bpos pos; - unsigned version; + struct bversion version; /* For BCH_WRITE_DATA_COMPRESSED: */ - struct bch_extent_crc64 crc; + struct bch_extent_crc128 crc; unsigned size; struct disk_reservation res; diff --git a/libbcache/journal.c b/libbcache/journal.c index 9e09b86d..3bb9e3c3 100644 --- a/libbcache/journal.c +++ b/libbcache/journal.c @@ -18,7 +18,8 @@ #include "io.h" #include "keylist.h" #include "journal.h" -#include "super.h" +#include "super-io.h" +#include "vstructs.h" #include <trace/events/bcache.h> @@ -52,19 +53,14 @@ static inline u64 journal_pin_seq(struct journal *j, return last_seq(j) + fifo_entry_idx(&j->pin, pin_list); } -#define for_each_jset_entry(entry, jset) \ - for (entry = (jset)->start; \ - entry < bkey_idx(jset, le32_to_cpu((jset)->u64s)); \ - entry = jset_keys_next(entry)) - static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, struct jset_entry *entry, unsigned type) { - while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) { + while (entry < vstruct_last(jset)) { if (JOURNAL_ENTRY_TYPE(entry) == type) return entry; - entry = jset_keys_next(entry); + entry = vstruct_next(entry); } return NULL; @@ -73,14 +69,11 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, #define for_each_jset_entry_type(entry, jset, type) \ for (entry = (jset)->start; \ (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = jset_keys_next(entry)) + entry = vstruct_next(entry)) #define for_each_jset_key(k, _n, entry, jset) \ for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \ - for (k = (entry)->start; \ - (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\ - (_n = bkey_next(k), 1)); \ - k = _n) + vstruct_for_each_safe(entry, k, _n) static inline void bch_journal_add_entry(struct journal_buf *buf, const void *data, size_t u64s, @@ -199,8 +192,6 @@ redo_peek: 
closure_sync(&cl); - mutex_lock(&c->btree_interior_update_lock); - for (i = 0;; i++) { struct btree_interior_update *as; struct pending_btree_node_free *d; @@ -212,6 +203,8 @@ redo_peek: } n = bl->entries[i]; mutex_unlock(&j->blacklist_lock); +redo_wait: + mutex_lock(&c->btree_interior_update_lock); /* * Is the node on the list of pending interior node updates - @@ -225,11 +218,11 @@ redo_peek: closure_wait(&as->wait, &cl); mutex_unlock(&c->btree_interior_update_lock); closure_sync(&cl); - break; + goto redo_wait; } - } - mutex_unlock(&c->btree_interior_update_lock); + mutex_unlock(&c->btree_interior_update_lock); + } mutex_lock(&j->blacklist_lock); @@ -377,7 +370,6 @@ out: struct journal_list { struct closure cl; struct mutex lock; - struct mutex cache_set_buffer_lock; struct list_head *head; int ret; }; @@ -394,7 +386,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist, { struct journal_replay *i, *pos; struct list_head *where; - size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s)); + size_t bytes = vstruct_bytes(j); __le64 last_seq; int ret; @@ -422,8 +414,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist, list_for_each_entry_reverse(i, jlist->head, list) { /* Duplicate? 
*/ if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - fsck_err_on(bytes != __set_bytes(&i->j, - le32_to_cpu(i->j.u64s)) || + fsck_err_on(bytes != vstruct_bytes(&i->j) || memcmp(j, &i->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", le64_to_cpu(j->seq)); @@ -455,11 +446,21 @@ fsck_err: return ret; } +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + static void journal_entry_null_range(void *start, void *end) { struct jset_entry *entry; - for (entry = start; entry != end; entry = jset_keys_next(entry)) { + for (entry = start; entry != end; entry = vstruct_next(entry)) { entry->u64s = 0; entry->btree_id = 0; entry->level = 0; @@ -473,7 +474,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j, struct bkey_i *k, enum bkey_type key_type, const char *type) { - void *next = jset_keys_next(entry); + void *next = vstruct_next(entry); const char *invalid; char buf[160]; int ret = 0; @@ -481,16 +482,16 @@ static int journal_validate_key(struct cache_set *c, struct jset *j, if (mustfix_fsck_err_on(!k->k.u64s, c, "invalid %s in journal: k->u64s 0", type)) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(jset_keys_next(entry), next); + journal_entry_null_range(vstruct_next(entry), next); return 0; } if (mustfix_fsck_err_on((void *) bkey_next(k) > - (void *) jset_keys_next(entry), c, + (void *) vstruct_next(entry), c, "invalid %s in journal: extends past end of journal entry", type)) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(jset_keys_next(entry), next); + journal_entry_null_range(vstruct_next(entry), next); return 0; } @@ -499,7 +500,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j, type, k->k.format)) { le16_add_cpu(&entry->u64s, -k->k.u64s); memmove(k, bkey_next(k), next 
- (void *) bkey_next(k)); - journal_entry_null_range(jset_keys_next(entry), next); + journal_entry_null_range(vstruct_next(entry), next); return 0; } @@ -514,7 +515,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j, le16_add_cpu(&entry->u64s, -k->k.u64s); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(jset_keys_next(entry), next); + journal_entry_null_range(vstruct_next(entry), next); return 0; } fsck_err: @@ -525,16 +526,17 @@ fsck_err: #define JOURNAL_ENTRY_NONE 6 #define JOURNAL_ENTRY_BAD 7 -static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector, +static int journal_entry_validate(struct cache_set *c, + struct jset *j, u64 sector, unsigned bucket_sectors_left, unsigned sectors_read) { struct jset_entry *entry; - size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s)); - u64 got, expect; + size_t bytes = vstruct_bytes(j); + struct bch_csum csum; int ret = 0; - if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb)) + if (le64_to_cpu(j->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) { @@ -554,25 +556,32 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto if (bytes > sectors_read << 9) return JOURNAL_ENTRY_REREAD; - got = le64_to_cpu(j->csum); - expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j)); - if (mustfix_fsck_err_on(got != expect, c, - "journal checksum bad (got %llu expect %llu), sector %lluu", - got, expect, sector)) { + if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c, + "journal entry with unknown csum type %llu sector %lluu", + JSET_CSUM_TYPE(j), sector)) + return JOURNAL_ENTRY_BAD; + + csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); + if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c, + "journal checksum bad, sector %llu", sector)) { /* XXX: retry IO, when we start retrying checksum errors */ /* XXX: note we might have missing journal 
entries */ return JOURNAL_ENTRY_BAD; } - if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), - c, "invalid journal entry: last_seq > seq")) + bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), + j->encrypted_start, + vstruct_end(j) - (void *) j->encrypted_start); + + if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c, + "invalid journal entry: last_seq > seq")) j->last_seq = j->seq; - for_each_jset_entry(entry, j) { + vstruct_for_each(j, entry) { struct bkey_i *k; - if (mustfix_fsck_err_on(jset_keys_next(entry) > - bkey_idx(j, le32_to_cpu(j->u64s)), c, + if (mustfix_fsck_err_on(vstruct_next(entry) > + vstruct_last(j), c, "journal entry extents past end of jset")) { j->u64s = cpu_to_le64((u64 *) entry - j->_data); break; @@ -580,9 +589,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto switch (JOURNAL_ENTRY_TYPE(entry)) { case JOURNAL_ENTRY_BTREE_KEYS: - for (k = entry->start; - k < bkey_idx(entry, le16_to_cpu(entry->u64s)); - k = bkey_next(k)) { + vstruct_for_each(entry, k) { ret = journal_validate_key(c, j, entry, k, bkey_type(entry->level, entry->btree_id), @@ -599,7 +606,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto le16_to_cpu(entry->u64s) != k->k.u64s, c, "invalid btree root journal entry: wrong number of keys")) { journal_entry_null_range(entry, - jset_keys_next(entry)); + vstruct_next(entry)); continue; } @@ -616,14 +623,14 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, - jset_keys_next(entry)); + vstruct_next(entry)); } break; default: mustfix_fsck_err(c, "invalid journal entry type %llu", JOURNAL_ENTRY_TYPE(entry)); - journal_entry_null_range(entry, jset_keys_next(entry)); + journal_entry_null_range(entry, vstruct_next(entry)); break; } } @@ -632,126 +639,127 @@ 
fsck_err: return ret; } -static int journal_read_bucket(struct cache *ca, struct journal_list *jlist, +struct journal_read_buf { + void *data; + size_t size; +}; + +static int journal_read_buf_realloc(struct journal_read_buf *b, + size_t new_size) +{ + void *n; + + new_size = roundup_pow_of_two(new_size); + n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size)); + if (!n) + return -ENOMEM; + + free_pages((unsigned long) b->data, get_order(b->size)); + b->data = n; + b->size = new_size; + return 0; +} + +static int journal_read_bucket(struct cache *ca, + struct journal_read_buf *buf, + struct journal_list *jlist, unsigned bucket, u64 *seq, bool *entries_found) { struct cache_set *c = ca->set; struct journal_device *ja = &ca->journal; struct bio *bio = ja->bio; - struct jset *j, *data; - unsigned blocks, sectors_read, bucket_offset = 0; - unsigned max_entry_sectors = c->journal.entry_size_max >> 9; - u64 sector = bucket_to_sector(ca, - journal_bucket(ca->disk_sb.sb, bucket)); + struct jset *j = NULL; + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; bool saw_bad = false; int ret = 0; - data = (void *) __get_free_pages(GFP_KERNEL, - get_order(c->journal.entry_size_max)); - if (!data) { - mutex_lock(&jlist->cache_set_buffer_lock); - data = c->journal.buf[0].data; - } - pr_debug("reading %u", bucket); - while (bucket_offset < ca->mi.bucket_size) { -reread: - sectors_read = min_t(unsigned, - ca->mi.bucket_size - bucket_offset, - max_entry_sectors); + while (offset < end) { + if (!sectors_read) { +reread: sectors_read = min_t(unsigned, + end - offset, buf->size >> 9); - bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; - bio->bi_iter.bi_sector = sector + bucket_offset; - bio->bi_iter.bi_size = sectors_read << 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - - ret = submit_bio_wait(bio); - - if (cache_fatal_io_err_on(ret, ca, - "journal read from sector %llu", 
- sector + bucket_offset) || - bch_meta_read_fault("journal")) { - ret = -EIO; - goto err; - } + bio_reset(bio); + bio->bi_bdev = ca->disk_sb.bdev; + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = sectors_read << 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bch_bio_map(bio, buf->data); - /* This function could be simpler now since we no longer write - * journal entries that overlap bucket boundaries; this means - * the start of a bucket will always have a valid journal entry - * if it has any journal entries at all. - */ + ret = submit_bio_wait(bio); - j = data; - while (sectors_read) { - ret = journal_entry_validate(c, j, - sector + bucket_offset, - ca->mi.bucket_size - bucket_offset, - sectors_read); - switch (ret) { - case BCH_FSCK_OK: - break; - case JOURNAL_ENTRY_REREAD: - goto reread; - case JOURNAL_ENTRY_NONE: - if (!saw_bad) - goto out; - blocks = 1; - goto next_block; - case JOURNAL_ENTRY_BAD: - saw_bad = true; - blocks = 1; - goto next_block; - default: - goto err; - } + if (cache_fatal_io_err_on(ret, ca, + "journal read from sector %llu", + offset) || + bch_meta_read_fault("journal")) + return -EIO; - /* - * This happens sometimes if we don't have discards on - - * when we've partially overwritten a bucket with new - * journal entries. 
We don't need the rest of the - * bucket: - */ - if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - goto out; - - ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - - ret = journal_entry_add(c, jlist, j); - switch (ret) { - case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; - break; - case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: - break; - default: - goto err; + j = buf->data; + } + + ret = journal_entry_validate(c, j, offset, + end - offset, sectors_read); + switch (ret) { + case BCH_FSCK_OK: + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { + ret = journal_read_buf_realloc(buf, + vstruct_bytes(j)); + if (ret) + return ret; } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; + sectors = c->sb.block_size; + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; + sectors = c->sb.block_size; + goto next_block; + default: + return ret; + } - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); -next_block: - blocks = __set_blocks(j, le32_to_cpu(j->u64s), - block_bytes(c)); + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. 
We don't need the rest of the + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) + return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - pr_debug("next"); - bucket_offset += blocks * c->sb.block_size; - sectors_read -= blocks * c->sb.block_size; - j = ((void *) j) + blocks * block_bytes(c); + ret = journal_entry_add(c, jlist, j); + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: + *entries_found = true; + break; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: + return ret; } + + if (le64_to_cpu(j->seq) > *seq) + *seq = le64_to_cpu(j->seq); + + sectors = vstruct_sectors(j, c->block_bits); +next_block: + pr_debug("next"); + offset += sectors; + sectors_read -= sectors; + j = ((void *) j) + (sectors << 9); } -out: - ret = 0; -err: - if (data == c->journal.buf[0].data) - mutex_unlock(&jlist->cache_set_buffer_lock); - else - free_pages((unsigned long) data, - get_order(c->journal.entry_size_max)); - return ret; + return 0; } static void bch_journal_read_device(struct closure *cl) @@ -759,15 +767,11 @@ static void bch_journal_read_device(struct closure *cl) #define read_bucket(b) \ ({ \ bool entries_found = false; \ - int ret = journal_read_bucket(ca, jlist, b, \ - &seq, &entries_found); \ + ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ + &entries_found); \ + if (ret) \ + goto err; \ __set_bit(b, bitmap); \ - if (ret) { \ - mutex_lock(&jlist->lock); \ - jlist->ret = ret; \ - mutex_unlock(&jlist->lock); \ - closure_return(cl); \ - } \ entries_found; \ }) @@ -777,24 +781,29 @@ static void bch_journal_read_device(struct closure *cl) struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); + struct journal_read_buf buf = { NULL, 0 }; - unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb); - DECLARE_BITMAP(bitmap, nr_buckets); + DECLARE_BITMAP(bitmap, ja->nr); unsigned i, l, r; u64 seq = 0; + int ret; - if (!nr_buckets) - closure_return(cl); + if 
(!ja->nr) + goto out; + + bitmap_zero(bitmap, ja->nr); + ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + if (ret) + goto err; - bitmap_zero(bitmap, nr_buckets); - pr_debug("%u journal buckets", nr_buckets); + pr_debug("%u journal buckets", ja->nr); /* * If the device supports discard but not secure discard, we can't do * the fancy fibonacci hash/binary search because the live journal * entries might not form a contiguous range: */ - for (i = 0; i < nr_buckets; i++) + for (i = 0; i < ja->nr; i++) read_bucket(i); goto search_done; @@ -805,8 +814,8 @@ static void bch_journal_read_device(struct closure *cl) * Read journal buckets ordered by golden ratio hash to quickly * find a sequence of buckets with valid journal entries */ - for (i = 0; i < nr_buckets; i++) { - l = (i * 2654435769U) % nr_buckets; + for (i = 0; i < ja->nr; i++) { + l = (i * 2654435769U) % ja->nr; if (test_bit(l, bitmap)) break; @@ -821,18 +830,18 @@ static void bch_journal_read_device(struct closure *cl) */ pr_debug("falling back to linear search"); linear_scan: - for (l = find_first_zero_bit(bitmap, nr_buckets); - l < nr_buckets; - l = find_next_zero_bit(bitmap, nr_buckets, l + 1)) + for (l = find_first_zero_bit(bitmap, ja->nr); + l < ja->nr; + l = find_next_zero_bit(bitmap, ja->nr, l + 1)) if (read_bucket(l)) goto bsearch; /* no journal entries on this device? 
*/ - if (l == nr_buckets) - closure_return(cl); + if (l == ja->nr) + goto out; bsearch: /* Binary search */ - r = find_next_bit(bitmap, nr_buckets, l + 1); + r = find_next_bit(bitmap, ja->nr, l + 1); pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { @@ -858,9 +867,9 @@ search_done: */ seq = 0; - for (i = 0; i < nr_buckets; i++) + for (i = 0; i < ja->nr; i++) if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) { + ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { /* * When journal_next_bucket() goes to allocate for * the first time, it'll use the bucket after @@ -875,20 +884,26 @@ search_done: * reclaimed - journal reclaim will immediately reclaim whatever isn't * pinned when it first runs: */ - ja->last_idx = (ja->cur_idx + 1) % nr_buckets; + ja->last_idx = (ja->cur_idx + 1) % ja->nr; /* * Read buckets in reverse order until we stop finding more journal * entries: */ - for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets; + for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; i != ja->cur_idx; - i = (i + nr_buckets - 1) % nr_buckets) + i = (i + ja->nr - 1) % ja->nr) if (!test_bit(i, bitmap) && !read_bucket(i)) break; - +out: + free_pages((unsigned long) buf.data, get_order(buf.size)); closure_return(cl); +err: + mutex_lock(&jlist->lock); + jlist->ret = ret; + mutex_unlock(&jlist->lock); + goto out; #undef read_bucket } @@ -930,6 +945,19 @@ static int journal_seq_blacklist_read(struct journal *j, return 0; } +static inline bool journal_has_keys(struct list_head *list) +{ + struct journal_replay *i; + struct jset_entry *entry; + struct bkey_i *k, *_n; + + list_for_each_entry(i, list, list) + for_each_jset_key(k, _n, entry, &i->j) + return true; + + return false; +} + int bch_journal_read(struct cache_set *c, struct list_head *list) { struct jset_entry *prio_ptrs; @@ -944,7 +972,6 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) closure_init_stack(&jlist.cl); mutex_init(&jlist.lock); - 
mutex_init(&jlist.cache_set_buffer_lock); jlist.head = list; jlist.ret = 0; @@ -964,6 +991,9 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) return BCH_FSCK_REPAIR_IMPOSSIBLE; } + fsck_err_on(c->sb.clean && journal_has_keys(list), c, + "filesystem marked clean but journal has keys to replay"); + j = &list_entry(list->prev, struct journal_replay, list)->j; unfixable_fsck_err_on(le64_to_cpu(j->seq) - @@ -1057,7 +1087,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) struct bkey_s_c k_s_c = bkey_i_to_s_c(k); if (btree_type_has_ptrs(type)) - __bch_btree_mark_key(c, type, k_s_c); + bch_btree_mark_key_initial(c, type, k_s_c); } } @@ -1171,10 +1201,9 @@ static enum { buf->data->last_seq = cpu_to_le64(last_seq(j)); j->prev_buf_sectors = - __set_blocks(buf->data, - le32_to_cpu(buf->data->u64s) + - journal_entry_u64s_reserve(buf), - block_bytes(c)) * c->sb.block_size; + vstruct_blocks_plus(buf->data, c->block_bits, + journal_entry_u64s_reserve(buf)) * + c->sb.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); @@ -1219,9 +1248,8 @@ static unsigned journal_dev_buckets_available(struct journal *j, struct cache *ca) { struct journal_device *ja = &ca->journal; - unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb); - unsigned next = (ja->cur_idx + 1) % nr; - unsigned available = (ja->last_idx + nr - next) % nr; + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; /* * Hack to avoid a deadlock during journal replay: @@ -1271,7 +1299,7 @@ static int journal_entry_sectors(struct journal *j) * for the previous entry we have to make sure we have space for * it too: */ - if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) { + if (bch_extent_has_device(e.c, ca->dev_idx)) { if (j->prev_buf_sectors > ca->journal.sectors_free) buckets_required++; @@ -1479,17 +1507,28 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list) entries++; } + if (keys) { + 
bch_btree_flush(c); + + /* + * Write a new journal entry _before_ we start journalling new data - + * otherwise, we could end up with btree node bsets with journal seqs + * arbitrarily far in the future vs. the most recently written journal + * entry on disk, if we crash before writing the next journal entry: + */ + ret = bch_journal_meta(&c->journal); + if (ret) + goto err; + } + bch_info(c, "journal replay done, %i keys in %i entries, seq %llu", keys, entries, (u64) atomic64_read(&j->seq)); - fsck_err_on(c->sb.clean && keys, c, - "filesystem marked clean, but journal had keys to replay"); - bch_journal_set_replay_done(&c->journal); err: if (ret) bch_err(c, "journal replay error: %d", ret); -fsck_err: + bch_journal_entries_free(list); return ret; @@ -1497,28 +1536,40 @@ fsck_err: static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr) { - unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr; + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch_sb_get_journal(ca->disk_sb.sb); + struct bch_sb_field *f; u64 *p; - int ret; - ret = bch_super_realloc(&ca->disk_sb, u64s); - if (ret) - return ret; + p = krealloc(ja->bucket_seq, nr * sizeof(u64), + GFP_KERNEL|__GFP_ZERO); + if (!p) + return -ENOMEM; + + ja->bucket_seq = p; - p = krealloc(ca->journal.bucket_seq, - nr * sizeof(u64), + p = krealloc(ja->buckets, nr * sizeof(u64), GFP_KERNEL|__GFP_ZERO); if (!p) return -ENOMEM; - ca->journal.bucket_seq = p; - ca->disk_sb.sb->u64s = cpu_to_le16(u64s); + ja->buckets = p; + + f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr + + sizeof(*journal_buckets) / sizeof(u64)); + if (!f) + return -ENOMEM; + f->type = BCH_SB_FIELD_journal; + ja->nr = nr; return 0; } int bch_cache_journal_alloc(struct cache *ca) { + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets; int ret; unsigned i; @@ -1540,11 +1591,15 @@ int bch_cache_journal_alloc(struct cache *ca) if (ret) 
return ret; - for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) { - unsigned long r = ca->mi.first_bucket + i; + journal_buckets = bch_sb_get_journal(ca->disk_sb.sb); + + for (i = 0; i < ja->nr; i++) { + u64 bucket = ca->mi.first_bucket + i; - bch_mark_metadata_bucket(ca, &ca->buckets[r], true); - set_journal_bucket(ca->disk_sb.sb, i, r); + ja->buckets[i] = bucket; + journal_buckets->buckets[i] = cpu_to_le64(bucket); + + bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true); } return 0; @@ -1749,7 +1804,7 @@ static void journal_reclaim_work(struct work_struct *work) struct cache *ca; struct journal_entry_pin *pin; u64 seq_to_flush = 0; - unsigned iter, nr, bucket_to_flush; + unsigned iter, bucket_to_flush; unsigned long next_flush; bool reclaim_lock_held = false, need_flush; @@ -1781,13 +1836,11 @@ static void journal_reclaim_work(struct work_struct *work) blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, - journal_bucket(ca->disk_sb.sb, - ja->last_idx)), + ja->buckets[ja->last_idx]), ca->mi.bucket_size, GFP_NOIO, 0); spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % - bch_nr_journal_buckets(ca->disk_sb.sb); + ja->last_idx = (ja->last_idx + 1) % ja->nr; spin_unlock(&j->lock); wake_up(&j->wait); @@ -1798,8 +1851,7 @@ static void journal_reclaim_work(struct work_struct *work) * buckets */ spin_lock(&j->lock); - nr = bch_nr_journal_buckets(ca->disk_sb.sb), - bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr; + bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; seq_to_flush = max_t(u64, seq_to_flush, ja->bucket_seq[bucket_to_flush]); spin_unlock(&j->lock); @@ -1861,7 +1913,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) */ extent_for_each_ptr_backwards(e, ptr) if (!(ca = PTR_CACHE(c, ptr)) || - ca->mi.state != CACHE_ACTIVE || + ca->mi.state != BCH_MEMBER_STATE_ACTIVE || ca->journal.sectors_free <= sectors) __bch_extent_drop_ptr(e, ptr); else @@ -1875,7 
+1927,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) */ group_for_each_cache_rcu(ca, &j->devs, iter) { struct journal_device *ja = &ca->journal; - unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb); if (replicas >= replicas_want) break; @@ -1884,21 +1935,20 @@ static int journal_write_alloc(struct journal *j, unsigned sectors) * Check that we can use this device, and aren't already using * it: */ - if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) || + if (bch_extent_has_device(e.c, ca->dev_idx) || !journal_dev_buckets_available(j, ca) || sectors > ca->mi.bucket_size) continue; ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % nr_buckets; + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq); extent_ptr_append(bkey_i_to_extent(&j->key), (struct bch_extent_ptr) { .offset = bucket_to_sector(ca, - journal_bucket(ca->disk_sb.sb, - ja->cur_idx)), - .dev = ca->sb.nr_this_dev, + ja->buckets[ja->cur_idx]), + .dev = ca->dev_idx, }); replicas++; @@ -1928,10 +1978,7 @@ static void journal_write_compact(struct jset *jset) * If we wanted to be really fancy here, we could sort all the keys in * the jset and drop keys that were overwritten - probably not worth it: */ - for (i = jset->start; - i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) && - (next = jset_keys_next(i), true); - i = next) { + vstruct_for_each_safe(jset, i, next) { unsigned u64s = le16_to_cpu(i->u64s); /* Empty entry: */ @@ -1945,7 +1992,7 @@ static void journal_write_compact(struct jset *jset) JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) && JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS && le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(jset_keys_next(prev), + memmove_u64s_down(vstruct_next(prev), i->_data, u64s); le16_add_cpu(&prev->u64s, u64s); @@ -1953,12 +2000,12 @@ static void journal_write_compact(struct jset *jset) } /* Couldn't merge, move i into 
new position (after prev): */ - prev = prev ? jset_keys_next(prev) : jset->start; + prev = prev ? vstruct_next(prev) : jset->start; if (i != prev) memmove_u64s_down(prev, i, jset_u64s(u64s)); } - prev = prev ? jset_keys_next(prev) : jset->start; + prev = prev ? vstruct_next(prev) : jset->start; jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); } @@ -2019,6 +2066,7 @@ static void journal_write(struct closure *cl) struct cache_set *c = container_of(j, struct cache_set, journal); struct cache *ca; struct journal_buf *w = journal_prev_buf(j); + struct jset *jset = w->data; struct bio *bio; struct bch_extent_ptr *ptr; unsigned i, sectors, bytes; @@ -2036,24 +2084,27 @@ static void journal_write(struct closure *cl) } mutex_unlock(&c->btree_root_lock); - journal_write_compact(w->data); + journal_write_compact(jset); + + jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand); + jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand); + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(BCACHE_JSET_VERSION); - w->data->read_clock = cpu_to_le16(c->prio_clock[READ].hand); - w->data->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand); - w->data->magic = cpu_to_le64(jset_magic(&c->disk_sb)); - w->data->version = cpu_to_le32(BCACHE_JSET_VERSION); + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c)); - SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN); - SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum); - w->data->csum = cpu_to_le64(__csum_set(w->data, - le32_to_cpu(w->data->u64s), - JSET_CSUM_TYPE(w->data))); + bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); - sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s), - block_bytes(c)) * c->sb.block_size; + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + sectors = vstruct_sectors(jset, c->block_bits); BUG_ON(sectors > 
j->prev_buf_sectors); - bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s)); + bytes = vstruct_bytes(w->data); memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); if (journal_write_alloc(j, sectors)) { @@ -2096,7 +2147,7 @@ static void journal_write(struct closure *cl) bio->bi_private = ca; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); - bch_bio_map(bio, w->data); + bch_bio_map(bio, jset); trace_bcache_journal_write(bio); closure_bio_submit_punt(bio, cl, c); @@ -2105,7 +2156,7 @@ static void journal_write(struct closure *cl) } for_each_cache(ca, c, i) - if (ca->mi.state == CACHE_ACTIVE && + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && journal_flushes_device(ca) && !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) { percpu_ref_get(&ca->ref); @@ -2503,7 +2554,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf) "\tnr\t\t%u\n" "\tcur_idx\t\t%u (seq %llu)\n" "\tlast_idx\t%u (seq %llu)\n", - iter, bch_nr_journal_buckets(ca->disk_sb.sb), + iter, ja->nr, ja->cur_idx, ja->bucket_seq[ja->cur_idx], ja->last_idx, ja->bucket_seq[ja->last_idx]); } @@ -2521,7 +2572,7 @@ static bool bch_journal_writing_to_device(struct cache *ca) spin_lock(&j->lock); ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), - ca->sb.nr_this_dev); + ca->dev_idx); spin_unlock(&j->lock); return ret; @@ -2541,10 +2592,11 @@ static bool bch_journal_writing_to_device(struct cache *ca) int bch_journal_move(struct cache *ca) { - unsigned i, nr_buckets; u64 last_flushed_seq; + struct journal_device *ja = &ca->journal; struct cache_set *c = ca->set; struct journal *j = &c->journal; + unsigned i; int ret = 0; /* Success */ if (bch_journal_writing_to_device(ca)) { @@ -2585,10 +2637,45 @@ int bch_journal_move(struct cache *ca) last_flushed_seq = last_seq(j); spin_unlock(&j->lock); - nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb); - - for (i = 0; i < nr_buckets; i += 1) - BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq); + for (i = 
0; i < ja->nr; i += 1) + BUG_ON(ja->bucket_seq[i] > last_flushed_seq); return ret; } + +void bch_journal_free_cache(struct cache *ca) +{ + kfree(ca->journal.buckets); + kfree(ca->journal.bucket_seq); +} + +int bch_journal_init_cache(struct cache *ca) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch_sb_get_journal(ca->disk_sb.sb); + unsigned i, journal_entry_pages; + + journal_entry_pages = + DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb), + PAGE_SECTORS); + + ja->nr = bch_nr_journal_buckets(journal_buckets); + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) + return -ENOMEM; + + ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages); + if (!ca->journal.bio) + return -ENOMEM; + + ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->buckets) + return -ENOMEM; + + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + + return 0; +} diff --git a/libbcache/journal.h b/libbcache/journal.h index 759ed609..9274831a 100644 --- a/libbcache/journal.h +++ b/libbcache/journal.h @@ -111,11 +111,7 @@ #include <linux/hash.h> #include "journal_types.h" - -static inline struct jset_entry *jset_keys_next(struct jset_entry *j) -{ - return (void *) __bkey_idx(j, le16_to_cpu(j->u64s)); -} +//#include "super-io.h" /* * Only used for holding the journal entries we read in btree_journal_read() @@ -182,7 +178,7 @@ static inline void bch_journal_add_entry_at(struct journal_buf *buf, unsigned type, enum btree_id id, unsigned level, unsigned offset) { - struct jset_entry *entry = bkey_idx(buf->data, offset); + struct jset_entry *entry = vstruct_idx(buf->data, offset); entry->u64s = cpu_to_le16(u64s); entry->btree_id = id; @@ -336,7 +332,7 @@ static inline int bch_journal_error(struct journal *j) static inline bool is_journal_device(struct cache *ca) { - return ca->mi.state == CACHE_ACTIVE && ca->mi.tier == 0; + return ca->mi.state == 
BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0; } static inline bool journal_flushes_device(struct cache *ca) @@ -367,21 +363,16 @@ ssize_t bch_journal_print_debug(struct journal *, char *); int bch_cache_journal_alloc(struct cache *); -static inline __le64 *__journal_buckets(struct cache_sb *sb) -{ - return sb->_data + bch_journal_buckets_offset(sb); -} - -static inline u64 journal_bucket(struct cache_sb *sb, unsigned nr) +static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j) { - return le64_to_cpu(__journal_buckets(sb)[nr]); -} - -static inline void set_journal_bucket(struct cache_sb *sb, unsigned nr, u64 bucket) -{ - __journal_buckets(sb)[nr] = cpu_to_le64(bucket); + return j + ? (__le64 *) vstruct_end(&j->field) - j->buckets + : 0; } int bch_journal_move(struct cache *); +void bch_journal_free_cache(struct cache *); +int bch_journal_init_cache(struct cache *); + #endif /* _BCACHE_JOURNAL_H */ diff --git a/libbcache/journal_types.h b/libbcache/journal_types.h index e3698b5a..5c95e37d 100644 --- a/libbcache/journal_types.h +++ b/libbcache/journal_types.h @@ -186,7 +186,7 @@ struct journal { * ugh: need to get prio_buckets converted over to the eventual new * transaction machinery */ - __le64 prio_buckets[MAX_CACHES_PER_SET]; + __le64 prio_buckets[BCH_SB_MEMBERS_MAX]; unsigned nr_prio_buckets; unsigned write_delay_ms; @@ -208,7 +208,7 @@ struct journal { /* * Embedded in struct cache. First three fields refer to the array of journal - * buckets, in cache_sb. + * buckets, in bch_sb. 
*/ struct journal_device { /* @@ -229,6 +229,8 @@ struct journal_device { * sufficient to read: */ unsigned last_idx; + unsigned nr; + u64 *buckets; /* Bio for journal reads/writes to this device */ struct bio *bio; diff --git a/libbcache/migrate.c b/libbcache/migrate.c index 5a26e228..407ca17e 100644 --- a/libbcache/migrate.c +++ b/libbcache/migrate.c @@ -25,7 +25,7 @@ static int issue_migration_move(struct cache *ca, return -ENOSPC; extent_for_each_ptr(bkey_s_c_to_extent(k), ptr) - if (ptr->dev == ca->sb.nr_this_dev) + if (ptr->dev == ca->dev_idx) goto found; BUG(); @@ -62,7 +62,7 @@ int bch_move_data_off_device(struct cache *ca) u64 seen_key_count; int ret = 0; - BUG_ON(ca->mi.state == CACHE_ACTIVE); + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE); bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); ctxt.avoid = ca; @@ -99,7 +99,7 @@ int bch_move_data_off_device(struct cache *ca) !(ret = btree_iter_err(k))) { if (!bkey_extent_is_data(k.k) || !bch_extent_has_device(bkey_s_c_to_extent(k), - ca->sb.nr_this_dev)) + ca->dev_idx)) goto next; ret = issue_migration_move(ca, &ctxt, k); @@ -151,14 +151,14 @@ static int bch_move_btree_off(struct cache *ca, enum btree_id id) struct btree *b; int ret; - BUG_ON(ca->mi.state == CACHE_ACTIVE); + BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE); closure_init_stack(&cl); for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); retry: - if (!bch_extent_has_device(e, ca->sb.nr_this_dev)) + if (!bch_extent_has_device(e, ca->dev_idx)) continue; ret = bch_btree_node_rewrite(&iter, b, &cl); @@ -188,7 +188,7 @@ retry: for_each_btree_node(&iter, c, id, POS_MIN, 0, b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev)); + BUG_ON(bch_extent_has_device(e, ca->dev_idx)); } bch_btree_iter_unlock(&iter); } @@ -282,7 +282,7 @@ static int bch_flag_key_bad(struct btree_iter *iter, e = bkey_i_to_s_extent(&tmp.key); 
extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == ca->sb.nr_this_dev) + if (ptr->dev == ca->dev_idx) bch_extent_drop_ptr(e, ptr); /* @@ -323,7 +323,7 @@ int bch_flag_data_bad(struct cache *ca) goto advance; e = bkey_s_c_to_extent(k); - if (!bch_extent_has_device(e, ca->sb.nr_this_dev)) + if (!bch_extent_has_device(e, ca->dev_idx)) goto advance; ret = bch_flag_key_bad(&iter, ca, e); diff --git a/libbcache/move.c b/libbcache/move.c index f3ab9e83..655a5233 100644 --- a/libbcache/move.c +++ b/libbcache/move.c @@ -5,7 +5,7 @@ #include "buckets.h" #include "io.h" #include "move.h" -#include "super.h" +#include "super-io.h" #include "keylist.h" #include <linux/ioprio.h> @@ -63,7 +63,8 @@ static int bch_migrate_index_update(struct bch_write_op *op) bkey_start_pos(&bch_keylist_front(keys)->k)); while (1) { - struct bkey_i *insert = bch_keylist_front(keys); + struct bkey_s_extent insert = + bkey_i_to_s_extent(bch_keylist_front(keys)); struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter); struct bch_extent_ptr *ptr; struct bkey_s_extent e; @@ -79,17 +80,18 @@ static int bch_migrate_index_update(struct bch_write_op *op) bkey_reassemble(&new.k, k); bch_cut_front(iter.pos, &new.k); - bch_cut_back(insert->k.p, &new.k.k); + bch_cut_back(insert.k->p, &new.k.k); e = bkey_i_to_s_extent(&new.k); /* hack - promotes can race: */ if (m->promote) - extent_for_each_ptr(bkey_i_to_s_extent(insert), ptr) + extent_for_each_ptr(insert, ptr) if (bch_extent_has_device(e.c, ptr->dev)) goto nomatch; ptr = bch_migrate_matching_ptr(m, e); if (ptr) { + int nr_new_dirty = bch_extent_nr_dirty_ptrs(insert.s_c); unsigned insert_flags = BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL; @@ -98,17 +100,22 @@ static int bch_migrate_index_update(struct bch_write_op *op) if (m->move) insert_flags |= BTREE_INSERT_USE_RESERVE; - if (m->move) + if (m->move) { + nr_new_dirty -= !ptr->cached; __bch_extent_drop_ptr(e, ptr); + } + + BUG_ON(nr_new_dirty < 0); memcpy_u64s(extent_entry_last(e), - &insert->v, - 
bkey_val_u64s(&insert->k)); - e.k->u64s += bkey_val_u64s(&insert->k); + insert.v, + bkey_val_u64s(insert.k)); + e.k->u64s += bkey_val_u64s(insert.k); bch_extent_narrow_crcs(e); bch_extent_drop_redundant_crcs(e); bch_extent_normalize(c, e.s); + bch_extent_mark_replicas_cached(c, e, nr_new_dirty); ret = bch_btree_insert_at(c, &op->res, NULL, op_journal_seq(op), @@ -148,7 +155,8 @@ void bch_migrate_write_init(struct cache_set *c, if (move_ptr) m->move_ptr = *move_ptr; - if (bkey_extent_is_cached(k.k)) + if (bkey_extent_is_cached(k.k) || + (move_ptr && move_ptr->cached)) flags |= BCH_WRITE_CACHED; bch_write_op_init(&m->op, c, &m->wbio, @@ -160,6 +168,7 @@ void bch_migrate_write_init(struct cache_set *c, if (m->move) m->op.alloc_reserve = RESERVE_MOVINGGC; + m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k)); m->op.nr_replicas = 1; m->op.index_update_fn = bch_migrate_index_update; } diff --git a/libbcache/movinggc.c b/libbcache/movinggc.c index cb4f1654..83407eb1 100644 --- a/libbcache/movinggc.c +++ b/libbcache/movinggc.c @@ -28,7 +28,7 @@ static const struct bch_extent_ptr *moving_pred(struct cache *ca, if (bkey_extent_is_data(k.k) && (ptr = bch_extent_has_device(bkey_s_c_to_extent(k), - ca->sb.nr_this_dev)) && + ca->dev_idx)) && PTR_BUCKET(ca, ptr)->mark.copygc) return ptr; diff --git a/libbcache/notify.c b/libbcache/notify.c index e9b5568c..3a50f8fb 100644 --- a/libbcache/notify.c +++ b/libbcache/notify.c @@ -25,7 +25,7 @@ static void notify_get(struct cache_set *c) env->envp_idx = 0; env->buflen = 0; - notify_var(c, "SET_UUID=%pU", c->disk_sb.user_uuid.b); + notify_var(c, "SET_UUID=%pU", c->sb.user_uuid.b); } static void notify_get_cache(struct cache *ca) @@ -34,7 +34,7 @@ static void notify_get_cache(struct cache *ca) char buf[BDEVNAME_SIZE]; notify_get(c); - notify_var(c, "UUID=%pU", ca->disk_sb.sb->disk_uuid.b); + notify_var(c, "UUID=%pU", ca->uuid.b); notify_var(c, "BLOCKDEV=%s", bdevname(ca->disk_sb.bdev, buf)); } diff --git a/libbcache/opts.c 
b/libbcache/opts.c index 60a2a4d1..333654eb 100644 --- a/libbcache/opts.c +++ b/libbcache/opts.c @@ -29,7 +29,6 @@ const char * const bch_str_hash_types[] = { "crc32c", "crc64", "siphash", - "sha1", NULL }; @@ -70,11 +69,11 @@ const char * const bch_uint_opt[] = { }; enum bch_opts { -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ Opt_##_name, - CACHE_SET_VISIBLE_OPTS() -#undef CACHE_SET_OPT + BCH_VISIBLE_OPTS() +#undef BCH_OPT Opt_bad_opt, }; @@ -144,15 +143,15 @@ static int parse_string_opt(const struct bch_option *opt, const char *s) static struct bch_opt_result parse_one_opt(const char *opt) { static const struct bch_option opt_table[] = { -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ [Opt_##_name] = { \ .name = #_name, \ .opts = _choices, \ .min = _min, \ .max = _max, \ }, - CACHE_SET_VISIBLE_OPTS() -#undef CACHE_SET_OPT + BCH_VISIBLE_OPTS() +#undef BCH_OPT }, *i; for (i = opt_table; @@ -186,13 +185,13 @@ int bch_parse_options(struct cache_set_opts *opts, int flags, char *options) struct bch_opt_result res = parse_one_opt(p); switch (res.opt) { -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ case Opt_##_name: \ opts->_name = res.val; \ break; - CACHE_SET_VISIBLE_OPTS() -#undef CACHE_SET_OPT + BCH_VISIBLE_OPTS() +#undef BCH_OPT case Opt_bad_opt: return -EINVAL; diff --git a/libbcache/opts.h b/libbcache/opts.h index 70df232c..1d30848f 100644 --- a/libbcache/opts.h +++ b/libbcache/opts.h @@ -30,47 +30,47 @@ extern const char * const bch_bool_opt[]; extern const char * const bch_uint_opt[]; /* dummy option, for options that aren't stored in the superblock */ -LE64_BITMASK(NO_SB_OPT, struct cache_sb, flags, 0, 0); - -#define CACHE_SET_VISIBLE_OPTS() \ - CACHE_SET_OPT(verbose_recovery, \ - bch_bool_opt, 0, 2, \ 
- NO_SB_OPT, false) \ - CACHE_SET_OPT(posix_acl, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, false) \ - CACHE_SET_OPT(journal_flush_disabled, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, true) \ - CACHE_SET_OPT(nofsck, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, true) \ - CACHE_SET_OPT(fix_errors, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, true) \ - CACHE_SET_OPT(nochanges, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, 0) \ - CACHE_SET_OPT(noreplay, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, 0) \ - CACHE_SET_OPT(norecovery, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, 0) \ - CACHE_SET_SB_OPTS() - -#define CACHE_SET_OPTS() \ - CACHE_SET_OPT(read_only, \ - bch_bool_opt, 0, 2, \ - NO_SB_OPT, 0) \ - CACHE_SET_VISIBLE_OPTS() +LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); + +#define BCH_VISIBLE_OPTS() \ + BCH_OPT(verbose_recovery, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, false) \ + BCH_OPT(posix_acl, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, false) \ + BCH_OPT(journal_flush_disabled, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, true) \ + BCH_OPT(nofsck, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, true) \ + BCH_OPT(fix_errors, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, true) \ + BCH_OPT(nochanges, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, 0) \ + BCH_OPT(noreplay, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, 0) \ + BCH_OPT(norecovery, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, 0) \ + BCH_SB_OPTS() + +#define BCH_OPTS() \ + BCH_OPT(read_only, \ + bch_bool_opt, 0, 2, \ + NO_SB_OPT, 0) \ + BCH_VISIBLE_OPTS() struct cache_set_opts { -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\ s8 _name; - CACHE_SET_OPTS() -#undef CACHE_SET_OPT + BCH_OPTS() +#undef BCH_OPT }; static inline struct cache_set_opts cache_set_opts_empty(void) @@ -85,27 +85,27 @@ static inline struct cache_set_opts cache_set_opts_empty(void) * Initial options from superblock - here we don't want any options undefined, * any options the superblock doesn't specify are set to 0: */ -static 
inline struct cache_set_opts cache_superblock_opts(struct cache_sb *sb) +static inline struct cache_set_opts cache_superblock_opts(struct bch_sb *sb) { return (struct cache_set_opts) { -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\ ._name = _sb_opt##_BITS ? _sb_opt(sb) : 0, - CACHE_SET_OPTS() -#undef CACHE_SET_OPT + BCH_SB_OPTS() +#undef BCH_OPT }; } static inline void cache_set_opts_apply(struct cache_set_opts *dst, struct cache_set_opts src) { -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\ BUILD_BUG_ON(_max > S8_MAX); \ if (src._name >= 0) \ dst->_name = src._name; - CACHE_SET_OPTS() -#undef CACHE_SET_OPT + BCH_SB_OPTS() +#undef BCH_OPT } int bch_parse_options(struct cache_set_opts *, int, char *); diff --git a/libbcache/siphash.c b/libbcache/siphash.c index 5ba80b52..3a6c9c82 100644 --- a/libbcache/siphash.c +++ b/libbcache/siphash.c @@ -43,19 +43,46 @@ * https://131002.net/siphash/ */ -//#include <sys/param.h> -//#include <sys/systm.h> - #include <asm/byteorder.h> +#include <asm/unaligned.h> +#include <linux/bitops.h> #include <linux/string.h> #include "siphash.h" -static void SipHash_CRounds(SIPHASH_CTX *, int); -static void SipHash_Rounds(SIPHASH_CTX *, int); +static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) +{ + while (rounds--) { + ctx->v[0] += ctx->v[1]; + ctx->v[2] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 13); + ctx->v[3] = rol64(ctx->v[3], 16); + + ctx->v[1] ^= ctx->v[0]; + ctx->v[3] ^= ctx->v[2]; + ctx->v[0] = rol64(ctx->v[0], 32); + + ctx->v[2] += ctx->v[1]; + ctx->v[0] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 17); + ctx->v[3] = rol64(ctx->v[3], 21); + + ctx->v[1] ^= ctx->v[2]; + ctx->v[3] ^= ctx->v[0]; + ctx->v[2] = rol64(ctx->v[2], 32); + } +} + +static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) +{ + u64 m = get_unaligned_le64(ptr); -void 
-SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) + ctx->v[3] ^= m; + SipHash_Rounds(ctx, rounds); + ctx->v[0] ^= m; +} + +void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) { u64 k0, k1; @@ -71,8 +98,8 @@ SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) ctx->bytes = 0; } -void -SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len) +void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, + const void *src, size_t len) { const u8 *ptr = src; size_t left, used; @@ -88,7 +115,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len) if (len >= left) { memcpy(&ctx->buf[used], ptr, left); - SipHash_CRounds(ctx, rc); + SipHash_CRounds(ctx, ctx->buf, rc); len -= left; ptr += left; } else { @@ -98,8 +125,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len) } while (len >= sizeof(ctx->buf)) { - memcpy(ctx->buf, ptr, sizeof(ctx->buf)); - SipHash_CRounds(ctx, rc); + SipHash_CRounds(ctx, ptr, rc); len -= sizeof(ctx->buf); ptr += sizeof(ctx->buf); } @@ -108,8 +134,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len) memcpy(&ctx->buf[used], ptr, len); } -void -SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) +void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) { u64 r; @@ -118,8 +143,7 @@ SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) *((__le64 *) dst) = cpu_to_le64(r); } -u64 -SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) +u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) { u64 r; size_t left, used; @@ -129,7 +153,7 @@ SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) memset(&ctx->buf[used], 0, left - 1); ctx->buf[7] = ctx->bytes; - SipHash_CRounds(ctx, rc); + SipHash_CRounds(ctx, ctx->buf, rc); ctx->v[2] ^= 0xff; SipHash_Rounds(ctx, rf); @@ -138,48 +162,11 @@ SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) return (r); } -u64 -SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) +u64 SipHash(const 
SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) { SIPHASH_CTX ctx; SipHash_Init(&ctx, key); SipHash_Update(&ctx, rc, rf, src, len); - return (SipHash_End(&ctx, rc, rf)); -} - -#define SIP_ROTL(x, b) ((x) << (b)) | ( (x) >> (64 - (b))) - -static void -SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -{ - while (rounds--) { - ctx->v[0] += ctx->v[1]; - ctx->v[2] += ctx->v[3]; - ctx->v[1] = SIP_ROTL(ctx->v[1], 13); - ctx->v[3] = SIP_ROTL(ctx->v[3], 16); - - ctx->v[1] ^= ctx->v[0]; - ctx->v[3] ^= ctx->v[2]; - ctx->v[0] = SIP_ROTL(ctx->v[0], 32); - - ctx->v[2] += ctx->v[1]; - ctx->v[0] += ctx->v[3]; - ctx->v[1] = SIP_ROTL(ctx->v[1], 17); - ctx->v[3] = SIP_ROTL(ctx->v[3], 21); - - ctx->v[1] ^= ctx->v[2]; - ctx->v[3] ^= ctx->v[0]; - ctx->v[2] = SIP_ROTL(ctx->v[2], 32); - } -} - -static void -SipHash_CRounds(SIPHASH_CTX *ctx, int rounds) -{ - u64 m = le64_to_cpu(*((__le64 *)ctx->buf)); - - ctx->v[3] ^= m; - SipHash_Rounds(ctx, rounds); - ctx->v[0] ^= m; + return SipHash_End(&ctx, rc, rf); } diff --git a/libbcache/str_hash.h b/libbcache/str_hash.h index a489304c..b14d05c9 100644 --- a/libbcache/str_hash.h +++ b/libbcache/str_hash.h @@ -3,37 +3,74 @@ #include "btree_iter.h" #include "checksum.h" +#include "inode.h" #include "siphash.h" #include "super.h" -#include <crypto/sha1_base.h> #include <linux/crc32c.h> +#include <crypto/hash.h> -static const SIPHASH_KEY bch_siphash_key = { - .k0 = cpu_to_le64(0x5a9585fd80087730ULL), - .k1 = cpu_to_le64(0xc8de666d50b45664ULL ), +struct bch_hash_info { + u8 type; + union { + __le64 crc_key; + SIPHASH_KEY siphash_key; + }; }; +static inline struct bch_hash_info +bch_hash_info_init(const struct bch_inode_unpacked *bi) +{ + /* XXX ick */ + struct bch_hash_info info = { + .type = (bi->i_flags >> INODE_STR_HASH_OFFSET) & + ~(~0 << INODE_STR_HASH_BITS) + }; + + switch (info.type) { + case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_CRC64: + info.crc_key = bi->i_hash_seed; + break; + case BCH_STR_HASH_SIPHASH: { + 
SHASH_DESC_ON_STACK(desc, bch_sha256); + u8 digest[crypto_shash_digestsize(bch_sha256)]; + + desc->tfm = bch_sha256; + desc->flags = 0; + + crypto_shash_digest(desc, (void *) &bi->i_hash_seed, + sizeof(bi->i_hash_seed), digest); + memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); + break; + } + default: + BUG(); + } + + return info; +} + struct bch_str_hash_ctx { union { - u32 crc32c; - u64 crc64; - SIPHASH_CTX siphash; + u32 crc32c; + u64 crc64; + SIPHASH_CTX siphash; }; }; static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx, - enum bch_str_hash_type type) + const struct bch_hash_info *info) { - switch (type) { + switch (info->type) { case BCH_STR_HASH_CRC32C: - ctx->crc32c = ~0; + ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); break; case BCH_STR_HASH_CRC64: - ctx->crc64 = ~0; + ctx->crc64 = bch_crc64_update(~0, &info->crc_key, sizeof(info->crc_key)); break; case BCH_STR_HASH_SIPHASH: - SipHash24_Init(&ctx->siphash, &bch_siphash_key); + SipHash24_Init(&ctx->siphash, &info->siphash_key); break; default: BUG(); @@ -41,10 +78,10 @@ static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx, } static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx, - enum bch_str_hash_type type, - const void *data, size_t len) + const struct bch_hash_info *info, + const void *data, size_t len) { - switch (type) { + switch (info->type) { case BCH_STR_HASH_CRC32C: ctx->crc32c = crc32c(ctx->crc32c, data, len); break; @@ -60,9 +97,9 @@ static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx, } static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx, - enum bch_str_hash_type type) + const struct bch_hash_info *info) { - switch (type) { + switch (info->type) { case BCH_STR_HASH_CRC32C: return ctx->crc32c; case BCH_STR_HASH_CRC64: @@ -74,19 +111,6 @@ static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx, } } -struct bch_hash_info { - u64 seed; - u8 type; -}; - -static inline struct bch_hash_info 
bch_hash_info_init(const struct bch_inode *bi) -{ - return (struct bch_hash_info) { - .seed = le64_to_cpu(bi->i_hash_seed), - .type = INODE_STR_HASH_TYPE(bi), - }; -} - struct bch_hash_desc { enum btree_id btree_id; u8 key_type; diff --git a/libbcache/super-io.c b/libbcache/super-io.c new file mode 100644 index 00000000..66338a1c --- /dev/null +++ b/libbcache/super-io.c @@ -0,0 +1,798 @@ + +#include "bcache.h" +#include "blockdev.h" +#include "checksum.h" +#include "error.h" +#include "io.h" +#include "journal.h" +#include "super-io.h" +#include "super.h" +#include "vstructs.h" + +#include <linux/backing-dev.h> + +static inline void __bch_sb_layout_size_assert(void) +{ + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); +} + +struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb, + enum bch_sb_field_types type) +{ + struct bch_sb_field *f; + + /* XXX: need locking around superblock to access optional fields */ + + vstruct_for_each(sb, f) + if (le32_to_cpu(f->type) == type) + return f; + return NULL; +} + +void bch_free_super(struct bcache_superblock *sb) +{ + if (sb->bio) + bio_put(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + free_pages((unsigned long) sb->sb, sb->page_order); + memset(sb, 0, sizeof(*sb)); +} + +static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order) +{ + struct bch_sb *new_sb; + struct bio *bio; + + if (sb->page_order >= order && sb->sb) + return 0; + + if (dynamic_fault("bcache:add:super_realloc")) + return -ENOMEM; + + bio = bio_kmalloc(GFP_KERNEL, 1 << order); + if (!bio) + return -ENOMEM; + + if (sb->bio) + bio_put(sb->bio); + sb->bio = bio; + + new_sb = (void *) __get_free_pages(GFP_KERNEL, order); + if (!new_sb) + return -ENOMEM; + + if (sb->sb) + memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); + + free_pages((unsigned long) sb->sb, sb->page_order); + sb->sb = new_sb; + + sb->page_order = order; + + return 0; +} + +int bch_dev_sb_realloc(struct 
bcache_superblock *sb, unsigned u64s) +{ + u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s); + u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; + + if (new_bytes > max_bytes) { + char buf[BDEVNAME_SIZE]; + + pr_err("%s: superblock too big: want %llu but have %llu", + bdevname(sb->bdev, buf), new_bytes, max_bytes); + return -ENOSPC; + } + + return __bch_super_realloc(sb, get_order(new_bytes)); +} + +static int bch_fs_sb_realloc(struct cache_set *c, unsigned u64s) +{ + u64 bytes = __vstruct_bytes(struct bch_sb, u64s); + struct bch_sb *sb; + unsigned order = get_order(bytes); + + if (c->disk_sb && order <= c->disk_sb_order) + return 0; + + sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (!sb) + return -ENOMEM; + + if (c->disk_sb) + memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order); + + free_pages((unsigned long) c->disk_sb, c->disk_sb_order); + + c->disk_sb = sb; + c->disk_sb_order = order; + return 0; +} + +static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb, + struct bch_sb_field *f, + unsigned u64s) +{ + unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; + + if (!f) { + f = vstruct_last(sb); + memset(f, 0, sizeof(u64) * u64s); + f->u64s = cpu_to_le32(u64s); + f->type = 0; + } else { + void *src, *dst; + + src = vstruct_end(f); + f->u64s = cpu_to_le32(u64s); + dst = vstruct_end(f); + + memmove(dst, src, vstruct_end(sb) - src); + + if (dst > src) + memset(src, 0, dst - src); + } + + le32_add_cpu(&sb->u64s, u64s - old_u64s); + + return f; + +} + +struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c, + struct bch_sb_field *f, + unsigned u64s) +{ + ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + struct cache *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + if (bch_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d)) + return NULL; + + for_each_cache(ca, c, i) { + struct bcache_superblock *sb = &ca->disk_sb; + + if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + percpu_ref_put(&ca->ref); + return NULL; + } + } + + return __bch_sb_field_resize(c->disk_sb, f, u64s); +} + +struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb, + struct bch_sb_field *f, + unsigned u64s) +{ + ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + + if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + return __bch_sb_field_resize(sb->sb, f, u64s); +} + +static const char *validate_sb_layout(struct bch_sb_layout *layout) +{ + u64 offset, prev_offset, max_sectors; + unsigned i; + + if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) + return "Not a bcache superblock layout"; + + if (layout->layout_type != 0) + return "Invalid superblock layout type"; + + if (!layout->nr_superblocks) + return "Invalid superblock layout: no superblocks"; + + if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) + return "Invalid superblock layout: too many superblocks"; + + max_sectors = 1 << layout->sb_max_size_bits; + + prev_offset = le64_to_cpu(layout->sb_offset[0]); + + if (prev_offset != BCH_SB_SECTOR) + return "Invalid superblock layout: doesn't have default superblock location"; + + for (i = 1; i < layout->nr_superblocks; i++) { + offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset < prev_offset + max_sectors) + return "Invalid superblock layout: superblocks overlap"; + prev_offset = offset; + } + + return NULL; +} + +const char *bch_validate_cache_super(struct bcache_superblock *disk_sb) +{ + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members *sb_mi; + struct bch_sb_field_journal *journal; + 
struct cache_member_cpu mi; + const char *err; + u16 block_size; + unsigned i; + + switch (le64_to_cpu(sb->version)) { + case BCACHE_SB_VERSION_CDEV_V4: + break; + default: + return"Unsupported superblock version"; + } + + if (BCH_SB_INITIALIZED(sb) && + le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4) + return "Unsupported superblock version"; + + block_size = le16_to_cpu(sb->block_size); + + if (!is_power_of_2(block_size) || + block_size > PAGE_SECTORS) + return "Bad block size"; + + if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le))) + return "Bad user UUID"; + + if (bch_is_zero(sb->uuid.b, sizeof(uuid_le))) + return "Bad internal UUID"; + + if (!sb->nr_devices || + sb->nr_devices <= sb->dev_idx || + sb->nr_devices > BCH_SB_MEMBERS_MAX) + return "Bad cache device number in set"; + + if (!BCH_SB_META_REPLICAS_WANT(sb) || + BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + return "Invalid number of metadata replicas"; + + if (!BCH_SB_META_REPLICAS_HAVE(sb) || + BCH_SB_META_REPLICAS_HAVE(sb) > + BCH_SB_META_REPLICAS_WANT(sb)) + return "Invalid number of metadata replicas"; + + if (!BCH_SB_DATA_REPLICAS_WANT(sb) || + BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + return "Invalid number of data replicas"; + + if (!BCH_SB_DATA_REPLICAS_HAVE(sb) || + BCH_SB_DATA_REPLICAS_HAVE(sb) > + BCH_SB_DATA_REPLICAS_WANT(sb)) + return "Invalid number of data replicas"; + + if (!BCH_SB_BTREE_NODE_SIZE(sb)) + return "Btree node size not set"; + + if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) + return "Btree node size not a power of two"; + + if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX) + return "Btree node size too large"; + + if (BCH_SB_GC_RESERVE(sb) < 5) + return "gc reserve percentage too small"; + + if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size) + return "max journal entry size too small"; + + /* 4 mb max: */ + if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX) + return "max journal entry size too big"; + + if 
(!sb->time_precision || + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) + return "invalid time precision"; + + /* validate layout */ + err = validate_sb_layout(&sb->layout); + if (err) + return err; + + vstruct_for_each(sb, f) { + if (!f->u64s) + return "Invalid superblock: invalid optional field"; + + if (vstruct_next(f) > vstruct_last(sb)) + return "Invalid superblock: invalid optional field"; + + if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR) + return "Invalid superblock: unknown optional field type"; + } + + /* Validate member info: */ + sb_mi = bch_sb_get_members(sb); + if (!sb_mi) + return "Invalid superblock: member info area missing"; + + if ((void *) (sb_mi->members + sb->nr_devices) > + vstruct_end(&sb_mi->field)) + return "Invalid superblock: bad member info"; + + mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx); + + for (i = 0; i < sb->layout.nr_superblocks; i++) { + u64 offset = le64_to_cpu(sb->layout.sb_offset[i]); + u64 max_size = 1 << sb->layout.sb_max_size_bits; + + if (offset + max_size > mi.first_bucket * mi.bucket_size) + return "Invalid superblock: first bucket comes before end of super"; + } + + if (mi.nbuckets > LONG_MAX) + return "Too many buckets"; + + if (mi.nbuckets - mi.first_bucket < 1 << 10) + return "Not enough buckets"; + + if (!is_power_of_2(mi.bucket_size) || + mi.bucket_size < PAGE_SECTORS || + mi.bucket_size < block_size) + return "Bad bucket size"; + + if (get_capacity(disk_sb->bdev->bd_disk) < + mi.bucket_size * mi.nbuckets) + return "Invalid superblock: device too small"; + + /* Validate journal buckets: */ + journal = bch_sb_get_journal(sb); + if (journal) { + for (i = 0; i < bch_nr_journal_buckets(journal); i++) { + u64 b = le64_to_cpu(journal->buckets[i]); + + if (b < mi.first_bucket || b >= mi.nbuckets) + return "bad journal bucket"; + } + } + + return NULL; +} + +/* device open: */ + +static bool bch_is_open_cache(struct block_device *bdev) +{ + struct cache_set *c; + struct cache *ca; + unsigned i; + + 
rcu_read_lock(); + list_for_each_entry(c, &bch_cache_sets, list) + for_each_cache_rcu(ca, c, i) + if (ca->disk_sb.bdev == bdev) { + rcu_read_unlock(); + return true; + } + rcu_read_unlock(); + return false; +} + +static bool bch_is_open(struct block_device *bdev) +{ + lockdep_assert_held(&bch_register_lock); + + return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); +} + +static const char *bch_blkdev_open(const char *path, void *holder, + struct cache_set_opts opts, + struct block_device **ret) +{ + struct block_device *bdev; + fmode_t mode = opts.nochanges > 0 + ? FMODE_READ + : FMODE_READ|FMODE_WRITE|FMODE_EXCL; + const char *err; + + *ret = NULL; + bdev = blkdev_get_by_path(path, mode, holder); + + if (bdev == ERR_PTR(-EBUSY)) { + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return "device busy"; + + err = bch_is_open(bdev) + ? "device already registered" + : "device busy"; + + bdput(bdev); + return err; + } + + if (IS_ERR(bdev)) + return "failed to open device"; + + bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; + + *ret = bdev; + return NULL; +} + +/* Update cached mi: */ +int bch_cache_set_mi_update(struct cache_set *c, + struct bch_member *mi, + unsigned nr_devices) +{ + struct cache_member_rcu *new, *old; + struct cache *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + new = kzalloc(sizeof(struct cache_member_rcu) + + sizeof(struct cache_member_cpu) * nr_devices, + GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->nr_devices = nr_devices; + + for (i = 0; i < nr_devices; i++) + new->m[i] = cache_mi_to_cpu_mi(&mi[i]); + + rcu_read_lock(); + for_each_cache(ca, c, i) + ca->mi = new->m[i]; + rcu_read_unlock(); + + old = rcu_dereference_protected(c->members, + lockdep_is_held(&c->sb_lock)); + + rcu_assign_pointer(c->members, new); + if (old) + kfree_rcu(old, rcu); + + return 0; +} + +static void bch_sb_update(struct cache_set *c) +{ + struct bch_sb *src = c->disk_sb; + + lockdep_assert_held(&c->sb_lock); + 
+ c->sb.uuid = src->uuid; + c->sb.user_uuid = src->user_uuid; + c->sb.block_size = le16_to_cpu(src->block_size); + c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src); + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src); + c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src); + c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); + c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); + c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); + c->sb.time_precision = le32_to_cpu(src->time_precision); +} + +/* doesn't copy member info */ +static void __copy_super(struct bch_sb *dst, struct bch_sb *src) +{ + struct bch_sb_field *src_f, *dst_f; + + dst->version = src->version; + dst->seq = src->seq; + dst->uuid = src->uuid; + dst->user_uuid = src->user_uuid; + memcpy(dst->label, src->label, sizeof(dst->label)); + + dst->block_size = src->block_size; + dst->nr_devices = src->nr_devices; + + dst->time_base_lo = src->time_base_lo; + dst->time_base_hi = src->time_base_hi; + dst->time_precision = src->time_precision; + + memcpy(dst->flags, src->flags, sizeof(dst->flags)); + memcpy(dst->features, src->features, sizeof(dst->features)); + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + vstruct_for_each(src, src_f) { + if (src_f->type == BCH_SB_FIELD_journal) + continue; + + dst_f = bch_sb_field_get(dst, src_f->type); + dst_f = __bch_sb_field_resize(dst, dst_f, + le32_to_cpu(src_f->u64s)); + + memcpy(dst_f, src_f, vstruct_bytes(src_f)); + } +} + +int bch_sb_to_cache_set(struct cache_set *c, struct bch_sb *src) +{ + struct bch_sb_field_members *members = + bch_sb_get_members(src); + struct bch_sb_field_journal *journal_buckets = + bch_sb_get_journal(src); + unsigned journal_u64s = journal_buckets + ? 
le32_to_cpu(journal_buckets->field.u64s) + : 0; + + lockdep_assert_held(&c->sb_lock); + + if (bch_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s)) + return -ENOMEM; + + if (bch_cache_set_mi_update(c, members->members, src->nr_devices)) + return -ENOMEM; + + __copy_super(c->disk_sb, src); + bch_sb_update(c); + + return 0; +} + +int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca) +{ + struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb; + struct bch_sb_field_journal *journal_buckets = + bch_sb_get_journal(dst); + unsigned journal_u64s = journal_buckets + ? le32_to_cpu(journal_buckets->field.u64s) + : 0; + unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; + int ret; + + ret = bch_dev_sb_realloc(&ca->disk_sb, u64s); + if (ret) + return ret; + + __copy_super(dst, src); + + return 0; +} + +/* read superblock: */ + +static const char *read_one_super(struct bcache_superblock *sb, u64 offset) +{ + struct bch_csum csum; + size_t bytes; + unsigned order; +reread: + bio_reset(sb->bio); + sb->bio->bi_bdev = sb->bdev; + sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR; + sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bch_bio_map(sb->bio, sb->sb); + + if (submit_bio_wait(sb->bio)) + return "IO error"; + + if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) + return "Not a bcache superblock"; + + if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4) + return "Unsupported superblock version"; + + bytes = vstruct_bytes(sb->sb); + + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) + return "Bad superblock: too big"; + + order = get_order(bytes); + if (order > sb->page_order) { + if (__bch_super_realloc(sb, order)) + return "cannot allocate memory"; + goto reread; + } + + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) + return "unknown csum type"; + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + (struct nonce) { 0 }, sb->sb); + + if (bch_crc_cmp(csum, 
sb->sb->csum)) + return "bad checksum reading superblock"; + + return NULL; +} + +const char *bch_read_super(struct bcache_superblock *sb, + struct cache_set_opts opts, + const char *path) +{ + struct bch_sb_layout layout; + const char *err; + unsigned i; + + lockdep_assert_held(&bch_register_lock); + + memset(sb, 0, sizeof(*sb)); + + err = bch_blkdev_open(path, &sb, opts, &sb->bdev); + if (err) + return err; + + err = "cannot allocate memory"; + if (__bch_super_realloc(sb, 0)) + goto err; + + err = "dynamic fault"; + if (cache_set_init_fault("read_super")) + goto err; + + err = read_one_super(sb, BCH_SB_SECTOR); + if (!err) + goto got_super; + + pr_err("error reading default super: %s", err); + + /* + * Error reading primary superblock - read location of backup + * superblocks: + */ + bio_reset(sb->bio); + sb->bio->bi_bdev = sb->bdev; + sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; + sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout); + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + /* + * use sb buffer to read layout, since sb buffer is page aligned but + * layout won't be: + */ + bch_bio_map(sb->bio, sb->sb); + + err = "IO error"; + if (submit_bio_wait(sb->bio)) + goto err; + + memcpy(&layout, sb->sb, sizeof(layout)); + err = validate_sb_layout(&layout); + if (err) + goto err; + + for (i = 0; i < layout.nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout.sb_offset[i]); + + if (offset == BCH_SB_SECTOR) + continue; + + err = read_one_super(sb, offset); + if (!err) + goto got_super; + } + goto err; +got_super: + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", + le64_to_cpu(sb->sb->version), + le64_to_cpu(sb->sb->flags), + le64_to_cpu(sb->sb->seq), + le16_to_cpu(sb->sb->u64s)); + + err = "Superblock block size smaller than device block size"; + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev)) + goto err; + + return NULL; +err: + bch_free_super(sb); + return err; +} + +/* write 
superblock: */ + +static void write_super_endio(struct bio *bio) +{ + struct cache *ca = bio->bi_private; + + /* XXX: return errors directly */ + + cache_fatal_io_err_on(bio->bi_error, ca, "superblock write"); + + bch_account_io_completion(ca); + + closure_put(&ca->set->sb_write); + percpu_ref_put(&ca->ref); +} + +static bool write_one_super(struct cache_set *c, struct cache *ca, unsigned idx) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + if (idx >= sb->layout.nr_superblocks) + return false; + + sb->offset = sb->layout.sb_offset[idx]; + + SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); + sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), + (struct nonce) { 0 }, sb); + + bio_reset(bio); + bio->bi_bdev = ca->disk_sb.bdev; + bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); + bio->bi_iter.bi_size = + roundup(vstruct_bytes(sb), + bdev_logical_block_size(ca->disk_sb.bdev)); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); + bch_bio_map(bio, sb); + + percpu_ref_get(&ca->ref); + closure_bio_submit_punt(bio, &c->sb_write, c); + + return true; +} + +void bch_write_super(struct cache_set *c) +{ + struct bch_sb_field_members *members = + bch_sb_get_members(c->disk_sb); + struct closure *cl = &c->sb_write; + struct cache *ca; + unsigned i, super_idx = 0; + bool wrote; + + lockdep_assert_held(&c->sb_lock); + + closure_init_stack(cl); + + le64_add_cpu(&c->disk_sb->seq, 1); + + for_each_cache(ca, c, i) + bch_sb_from_cache_set(c, ca); + + do { + wrote = false; + for_each_cache(ca, c, i) + if (write_one_super(c, ca, super_idx)) + wrote = true; + + closure_sync(cl); + super_idx++; + } while (wrote); + + /* Make new options visible after they're persistent: */ + bch_cache_set_mi_update(c, members->members, c->sb.nr_devices); + bch_sb_update(c); +} + +void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k, + bool meta) +{ + struct bch_member *mi; + 
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + + mutex_lock(&c->sb_lock); + + /* recheck, might have raced */ + if (bch_check_super_marked(c, k, meta)) { + mutex_unlock(&c->sb_lock); + return; + } + + mi = bch_sb_get_members(c->disk_sb)->members; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) + (meta + ? SET_BCH_MEMBER_HAS_METADATA + : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true); + + bch_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/libbcache/super-io.h b/libbcache/super-io.h new file mode 100644 index 00000000..1eda57bc --- /dev/null +++ b/libbcache/super-io.h @@ -0,0 +1,141 @@ +#ifndef _BCACHE_SUPER_IO_H +#define _BCACHE_SUPER_IO_H + +#include "extents.h" +#include "super_types.h" + +#include <asm/byteorder.h> + +struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types); + +#define BCH_SB_FIELD_TYPE(_name) \ +static inline struct bch_sb_field_##_name * \ +bch_sb_get_##_name(struct bch_sb *sb) \ +{ \ + struct bch_sb_field *f = \ + bch_sb_field_get(sb, BCH_SB_FIELD_##_name); \ + \ + return container_of_or_null(f, struct bch_sb_field_##_name, field);\ +} + +BCH_SB_FIELD_TYPE(journal); +BCH_SB_FIELD_TYPE(members); +BCH_SB_FIELD_TYPE(crypt); + +static inline bool bch_sb_test_feature(struct bch_sb *sb, + enum bch_sb_features f) +{ + unsigned w = f / 64; + unsigned b = f % 64; + + return le64_to_cpu(sb->features[w]) & (1ULL << b); +} + +static inline void bch_sb_set_feature(struct bch_sb *sb, + enum bch_sb_features f) +{ + if (!bch_sb_test_feature(sb, f)) { + unsigned w = f / 64; + unsigned b = f % 64; + + le64_add_cpu(&sb->features[w], 1ULL << b); + } +} + +static inline __le64 bch_sb_magic(struct cache_set *c) +{ + __le64 ret; + memcpy(&ret, &c->sb.uuid, sizeof(ret)); + return ret; +} + +static inline __u64 jset_magic(struct cache_set *c) +{ + return __le64_to_cpu(bch_sb_magic(c) ^ JSET_MAGIC); +} + +static inline __u64 pset_magic(struct cache_set *c) +{ + return 
__le64_to_cpu(bch_sb_magic(c) ^ PSET_MAGIC); +} + +static inline __u64 bset_magic(struct cache_set *c) +{ + return __le64_to_cpu(bch_sb_magic(c) ^ BSET_MAGIC); +} + +static inline struct cache_member_cpu cache_mi_to_cpu_mi(struct bch_member *mi) +{ + return (struct cache_member_cpu) { + .nbuckets = le64_to_cpu(mi->nbuckets), + .first_bucket = le16_to_cpu(mi->first_bucket), + .bucket_size = le16_to_cpu(mi->bucket_size), + .state = BCH_MEMBER_STATE(mi), + .tier = BCH_MEMBER_TIER(mi), + .has_metadata = BCH_MEMBER_HAS_METADATA(mi), + .has_data = BCH_MEMBER_HAS_DATA(mi), + .replacement = BCH_MEMBER_REPLACEMENT(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)), + }; +} + +int bch_cache_set_mi_update(struct cache_set *, struct bch_member *, unsigned); + +int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *); +int bch_sb_from_cache_set(struct cache_set *, struct cache *); + +struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *, + struct bch_sb_field *, unsigned); +struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *, + struct bch_sb_field *, unsigned); + +void bch_free_super(struct bcache_superblock *); +int bch_super_realloc(struct bcache_superblock *, unsigned); + +const char *bch_validate_cache_super(struct bcache_superblock *); + +const char *bch_read_super(struct bcache_superblock *, + struct cache_set_opts, const char *); +void bch_write_super(struct cache_set *); + +void bch_check_mark_super_slowpath(struct cache_set *, + const struct bkey_i *, bool); + +#define cache_member_info_get(_c) \ + (rcu_read_lock(), rcu_dereference((_c)->members)) + +#define cache_member_info_put() rcu_read_unlock() + +static inline bool bch_check_super_marked(struct cache_set *c, + const struct bkey_i *k, bool meta) +{ + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + const struct bch_extent_ptr *ptr; + struct cache_member_cpu *mi = cache_member_info_get(c)->m; + bool ret = true; + + 
extent_for_each_ptr(e, ptr) + if (!ptr->cached && + !(meta + ? mi[ptr->dev].has_metadata + : mi[ptr->dev].has_data)) { + ret = false; + break; + } + + cache_member_info_put(); + + return ret; +} + +static inline void bch_check_mark_super(struct cache_set *c, + const struct bkey_i *k, bool meta) +{ + if (bch_check_super_marked(c, k, meta)) + return; + + bch_check_mark_super_slowpath(c, k, meta); +} + +#endif /* _BCACHE_SUPER_IO_H */ diff --git a/libbcache/super.c b/libbcache/super.c index 296700b3..c026c0dd 100644 --- a/libbcache/super.c +++ b/libbcache/super.c @@ -31,12 +31,14 @@ #include "notify.h" #include "stats.h" #include "super.h" +#include "super-io.h" #include "tier.h" #include "writeback.h" #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/debugfs.h> +#include <linux/device.h> #include <linux/genhd.h> #include <linux/idr.h> #include <linux/kthread.h> @@ -69,70 +71,11 @@ static struct device *bch_chardev; static DEFINE_IDR(bch_chardev_minor); static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); struct workqueue_struct *bcache_io_wq; -struct crypto_shash *bch_sha1; +struct crypto_shash *bch_sha256; static void bch_cache_stop(struct cache *); static int bch_cache_online(struct cache *); -static bool bch_is_open_cache(struct block_device *bdev) -{ - struct cache_set *c; - struct cache *ca; - unsigned i; - - rcu_read_lock(); - list_for_each_entry(c, &bch_cache_sets, list) - for_each_cache_rcu(ca, c, i) - if (ca->disk_sb.bdev == bdev) { - rcu_read_unlock(); - return true; - } - rcu_read_unlock(); - return false; -} - -static bool bch_is_open(struct block_device *bdev) -{ - lockdep_assert_held(&bch_register_lock); - - return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev); -} - -static const char *bch_blkdev_open(const char *path, void *holder, - struct cache_set_opts opts, - struct block_device **ret) -{ - struct block_device *bdev; - fmode_t mode = opts.nochanges > 0 - ? 
FMODE_READ - : FMODE_READ|FMODE_WRITE|FMODE_EXCL; - const char *err; - - *ret = NULL; - bdev = blkdev_get_by_path(path, mode, holder); - - if (bdev == ERR_PTR(-EBUSY)) { - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return "device busy"; - - err = bch_is_open(bdev) - ? "device already registered" - : "device busy"; - - bdput(bdev); - return err; - } - - if (IS_ERR(bdev)) - return "failed to open device"; - - bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; - - *ret = bdev; - return NULL; -} - static int bch_congested_fn(void *data, int bdi_bits) { struct backing_dev_info *bdi; @@ -168,520 +111,6 @@ static int bch_congested_fn(void *data, int bdi_bits) return ret; } -/* Superblock */ - -static struct cache_member_cpu cache_mi_to_cpu_mi(struct cache_member *mi) -{ - return (struct cache_member_cpu) { - .nbuckets = le64_to_cpu(mi->nbuckets), - .first_bucket = le16_to_cpu(mi->first_bucket), - .bucket_size = le16_to_cpu(mi->bucket_size), - .state = CACHE_STATE(mi), - .tier = CACHE_TIER(mi), - .replication_set= CACHE_REPLICATION_SET(mi), - .has_metadata = CACHE_HAS_METADATA(mi), - .has_data = CACHE_HAS_DATA(mi), - .replacement = CACHE_REPLACEMENT(mi), - .discard = CACHE_DISCARD(mi), - .valid = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)), - }; -} - -static const char *validate_cache_super(struct bcache_superblock *disk_sb) -{ - struct cache_sb *sb = disk_sb->sb; - struct cache_member_cpu mi; - u16 block_size; - unsigned i; - - switch (le64_to_cpu(sb->version)) { - case BCACHE_SB_VERSION_CDEV_V0: - case BCACHE_SB_VERSION_CDEV_WITH_UUID: - case BCACHE_SB_VERSION_CDEV_V2: - case BCACHE_SB_VERSION_CDEV_V3: - break; - default: - return"Unsupported superblock version"; - } - - if (CACHE_SET_SYNC(sb) && - le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V3) - return "Unsupported superblock version"; - - block_size = le16_to_cpu(sb->block_size); - - if (!is_power_of_2(block_size) || - block_size > PAGE_SECTORS) - return "Bad block size"; - - if 
(bch_is_zero(sb->disk_uuid.b, sizeof(uuid_le))) - return "Bad disk UUID"; - - if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le))) - return "Bad user UUID"; - - if (bch_is_zero(sb->set_uuid.b, sizeof(uuid_le))) - return "Bad set UUID"; - - if (!sb->nr_in_set || - sb->nr_in_set <= sb->nr_this_dev || - sb->nr_in_set > MAX_CACHES_PER_SET) - return "Bad cache device number in set"; - - if (!CACHE_SET_META_REPLICAS_WANT(sb) || - CACHE_SET_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!CACHE_SET_META_REPLICAS_HAVE(sb) || - CACHE_SET_META_REPLICAS_HAVE(sb) > - CACHE_SET_META_REPLICAS_WANT(sb)) - return "Invalid number of metadata replicas"; - - if (!CACHE_SET_DATA_REPLICAS_WANT(sb) || - CACHE_SET_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; - - if (!CACHE_SET_DATA_REPLICAS_HAVE(sb) || - CACHE_SET_DATA_REPLICAS_HAVE(sb) > - CACHE_SET_DATA_REPLICAS_WANT(sb)) - return "Invalid number of data replicas"; - - if (CACHE_SB_CSUM_TYPE(sb) >= BCH_CSUM_NR) - return "Invalid checksum type"; - - if (!CACHE_SET_BTREE_NODE_SIZE(sb)) - return "Btree node size not set"; - - if (!is_power_of_2(CACHE_SET_BTREE_NODE_SIZE(sb))) - return "Btree node size not a power of two"; - - if (CACHE_SET_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX) - return "Btree node size too large"; - - /* Default value, for old filesystems: */ - if (!CACHE_SET_GC_RESERVE(sb)) - SET_CACHE_SET_GC_RESERVE(sb, 10); - - if (CACHE_SET_GC_RESERVE(sb) < 5) - return "gc reserve percentage too small"; - - if (!CACHE_SET_JOURNAL_ENTRY_SIZE(sb)) - SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, 9); - - /* 4 mb max: */ - if (512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX) - return "max journal entry size too big"; - - if (le16_to_cpu(sb->u64s) < bch_journal_buckets_offset(sb)) - return "Invalid superblock: member info area missing"; - - mi = cache_mi_to_cpu_mi(sb->members + sb->nr_this_dev); - - if (mi.nbuckets > LONG_MAX) - return 
"Too many buckets"; - - if (mi.nbuckets < 1 << 8) - return "Not enough buckets"; - - if (!is_power_of_2(mi.bucket_size) || - mi.bucket_size < PAGE_SECTORS || - mi.bucket_size < block_size) - return "Bad bucket size"; - - if (get_capacity(disk_sb->bdev->bd_disk) < - mi.bucket_size * mi.nbuckets) - return "Invalid superblock: device too small"; - - if (le64_to_cpu(sb->offset) + - (__set_blocks(sb, le16_to_cpu(sb->u64s), - block_size << 9) * block_size) > - mi.first_bucket * mi.bucket_size) - return "Invalid superblock: first bucket comes before end of super"; - - for (i = 0; i < bch_nr_journal_buckets(sb); i++) - if (journal_bucket(sb, i) < mi.first_bucket || - journal_bucket(sb, i) >= mi.nbuckets) - return "bad journal bucket"; - - return NULL; -} - -void free_super(struct bcache_superblock *sb) -{ - if (sb->bio) - bio_put(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - - free_pages((unsigned long) sb->sb, sb->page_order); - memset(sb, 0, sizeof(*sb)); -} - -static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order) -{ - struct cache_sb *new_sb; - struct bio *bio; - - if (sb->page_order >= order && sb->sb) - return 0; - - new_sb = (void *) __get_free_pages(GFP_KERNEL, order); - if (!new_sb) - return -ENOMEM; - - bio = (dynamic_fault("bcache:add:super_realloc") - ? 
NULL - : bio_kmalloc(GFP_KERNEL, 1 << order)); - if (!bio) { - free_pages((unsigned long) new_sb, order); - return -ENOMEM; - } - - if (sb->sb) - memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); - - free_pages((unsigned long) sb->sb, sb->page_order); - sb->sb = new_sb; - - if (sb->bio) - bio_put(sb->bio); - sb->bio = bio; - - sb->page_order = order; - - return 0; -} - -int bch_super_realloc(struct bcache_superblock *sb, unsigned u64s) -{ - struct cache_member *mi = sb->sb->members + sb->sb->nr_this_dev; - char buf[BDEVNAME_SIZE]; - size_t bytes = __set_bytes((struct cache_sb *) NULL, u64s); - u64 want = bytes + (SB_SECTOR << 9); - - u64 first_bucket_offset = (u64) le16_to_cpu(mi->first_bucket) * - ((u64) le16_to_cpu(mi->bucket_size) << 9); - - if (want > first_bucket_offset) { - pr_err("%s: superblock too big: want %llu but have %llu", - bdevname(sb->bdev, buf), want, first_bucket_offset); - return -ENOSPC; - } - - return __bch_super_realloc(sb, get_order(bytes)); -} - -static const char *read_super(struct bcache_superblock *sb, - struct cache_set_opts opts, - const char *path) -{ - const char *err; - unsigned order = 0; - - lockdep_assert_held(&bch_register_lock); - - memset(sb, 0, sizeof(*sb)); - - err = bch_blkdev_open(path, &sb, opts, &sb->bdev); - if (err) - return err; -retry: - err = "cannot allocate memory"; - if (__bch_super_realloc(sb, order)) - goto err; - - err = "dynamic fault"; - if (cache_set_init_fault("read_super")) - goto err; - - bio_reset(sb->bio); - sb->bio->bi_bdev = sb->bdev; - sb->bio->bi_iter.bi_sector = SB_SECTOR; - sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order; - bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); - bch_bio_map(sb->bio, sb->sb); - - err = "IO error"; - if (submit_bio_wait(sb->bio)) - goto err; - - err = "Not a bcache superblock"; - if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) - goto err; - - err = "Superblock has incorrect offset"; - if (le64_to_cpu(sb->sb->offset) != SB_SECTOR) - goto err; - - 
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", - le64_to_cpu(sb->sb->version), - le64_to_cpu(sb->sb->flags), - le64_to_cpu(sb->sb->seq), - le16_to_cpu(sb->sb->u64s)); - - err = "Superblock block size smaller than device block size"; - if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev)) - goto err; - - order = get_order(__set_bytes(sb->sb, le16_to_cpu(sb->sb->u64s))); - if (order > sb->page_order) - goto retry; - - err = "bad checksum reading superblock"; - if (le64_to_cpu(sb->sb->csum) != - __csum_set(sb->sb, le16_to_cpu(sb->sb->u64s), - le64_to_cpu(sb->sb->version) < - BCACHE_SB_VERSION_CDEV_V3 - ? BCH_CSUM_CRC64 - : CACHE_SB_CSUM_TYPE(sb->sb))) - goto err; - - return NULL; -err: - free_super(sb); - return err; -} - -void __write_super(struct cache_set *c, struct bcache_superblock *disk_sb) -{ - struct cache_sb *sb = disk_sb->sb; - struct bio *bio = disk_sb->bio; - - bio->bi_bdev = disk_sb->bdev; - bio->bi_iter.bi_sector = SB_SECTOR; - bio->bi_iter.bi_size = - roundup(__set_bytes(sb, le16_to_cpu(sb->u64s)), - bdev_logical_block_size(disk_sb->bdev)); - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); - bch_bio_map(bio, sb); - - pr_debug("ver %llu, flags %llu, seq %llu", - le64_to_cpu(sb->version), - le64_to_cpu(sb->flags), - le64_to_cpu(sb->seq)); - - bch_generic_make_request(bio, c); -} - -static void write_super_endio(struct bio *bio) -{ - struct cache *ca = bio->bi_private; - - /* XXX: return errors directly */ - - cache_fatal_io_err_on(bio->bi_error, ca, "superblock write"); - - bch_account_io_completion(ca); - - closure_put(&ca->set->sb_write); - percpu_ref_put(&ca->ref); -} - -static void bcache_write_super_unlock(struct closure *cl) -{ - struct cache_set *c = container_of(cl, struct cache_set, sb_write); - - up(&c->sb_write_mutex); -} - -/* Update cached mi: */ -static int cache_set_mi_update(struct cache_set *c, - struct cache_member *mi, - unsigned nr_in_set) -{ - struct cache_member_rcu *new, 
*old; - struct cache *ca; - unsigned i; - - mutex_lock(&c->mi_lock); - - new = kzalloc(sizeof(struct cache_member_rcu) + - sizeof(struct cache_member_cpu) * nr_in_set, - GFP_KERNEL); - if (!new) { - mutex_unlock(&c->mi_lock); - return -ENOMEM; - } - - new->nr_in_set = nr_in_set; - - for (i = 0; i < nr_in_set; i++) - new->m[i] = cache_mi_to_cpu_mi(&mi[i]); - - rcu_read_lock(); - for_each_cache(ca, c, i) - ca->mi = new->m[i]; - rcu_read_unlock(); - - old = rcu_dereference_protected(c->members, - lockdep_is_held(&c->mi_lock)); - - rcu_assign_pointer(c->members, new); - if (old) - kfree_rcu(old, rcu); - - mutex_unlock(&c->mi_lock); - return 0; -} - -/* doesn't copy member info */ -static void __copy_super(struct cache_sb *dst, struct cache_sb *src) -{ - dst->version = src->version; - dst->seq = src->seq; - dst->user_uuid = src->user_uuid; - dst->set_uuid = src->set_uuid; - memcpy(dst->label, src->label, SB_LABEL_SIZE); - dst->flags = src->flags; - dst->flags2 = src->flags2; - dst->nr_in_set = src->nr_in_set; - dst->block_size = src->block_size; -} - -static int cache_sb_to_cache_set(struct cache_set *c, struct cache_sb *src) -{ - struct cache_member *new; - - lockdep_assert_held(&bch_register_lock); - - new = kzalloc(sizeof(struct cache_member) * src->nr_in_set, - GFP_KERNEL); - if (!new) - return -ENOMEM; - - memcpy(new, src->members, - src->nr_in_set * sizeof(struct cache_member)); - - if (cache_set_mi_update(c, new, src->nr_in_set)) { - kfree(new); - return -ENOMEM; - } - - kfree(c->disk_mi); - c->disk_mi = new; - - __copy_super(&c->disk_sb, src); - - c->sb.block_size = le16_to_cpu(src->block_size); - c->sb.btree_node_size = CACHE_SET_BTREE_NODE_SIZE(src); - c->sb.nr_in_set = src->nr_in_set; - c->sb.clean = CACHE_SET_CLEAN(src); - c->sb.meta_replicas_have= CACHE_SET_META_REPLICAS_HAVE(src); - c->sb.data_replicas_have= CACHE_SET_DATA_REPLICAS_HAVE(src); - c->sb.str_hash_type = CACHE_SET_STR_HASH_TYPE(src); - - return 0; -} - -static int cache_sb_from_cache_set(struct 
cache_set *c, struct cache *ca) -{ - struct cache_sb *src = &c->disk_sb, *dst = ca->disk_sb.sb; - - if (src->nr_in_set != dst->nr_in_set) { - /* - * We have to preserve the list of journal buckets on the - * cache's superblock: - */ - unsigned old_offset = bch_journal_buckets_offset(dst); - unsigned u64s = bch_journal_buckets_offset(src) - + bch_nr_journal_buckets(dst); - int ret = bch_super_realloc(&ca->disk_sb, u64s); - - if (ret) - return ret; - - dst->nr_in_set = src->nr_in_set; - dst->u64s = cpu_to_le16(u64s); - - memmove(dst->_data + bch_journal_buckets_offset(dst), - dst->_data + old_offset, - bch_nr_journal_buckets(dst) * sizeof(u64)); - } - - memcpy(dst->_data, - c->disk_mi, - src->nr_in_set * sizeof(struct cache_member)); - - __copy_super(dst, src); - - return 0; -} - -static void __bcache_write_super(struct cache_set *c) -{ - struct closure *cl = &c->sb_write; - struct cache *ca; - unsigned i; - - cache_set_mi_update(c, c->disk_mi, c->sb.nr_in_set); - - closure_init(cl, &c->cl); - - if (c->opts.nochanges) - goto no_io; - - le64_add_cpu(&c->disk_sb.seq, 1); - - for_each_cache(ca, c, i) { - struct cache_sb *sb = ca->disk_sb.sb; - struct bio *bio = ca->disk_sb.bio; - - cache_sb_from_cache_set(c, ca); - - SET_CACHE_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); - sb->csum = cpu_to_le64(__csum_set(sb, - le16_to_cpu(sb->u64s), - CACHE_SB_CSUM_TYPE(sb))); - - bio_reset(bio); - bio->bi_bdev = ca->disk_sb.bdev; - bio->bi_end_io = write_super_endio; - bio->bi_private = ca; - - closure_get(cl); - percpu_ref_get(&ca->ref); - __write_super(c, &ca->disk_sb); - } -no_io: - closure_return_with_destructor(cl, bcache_write_super_unlock); -} - -void bcache_write_super(struct cache_set *c) -{ - down(&c->sb_write_mutex); - __bcache_write_super(c); -} - -void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k, - bool meta) -{ - struct cache_member *mi; - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - - if 
(!CACHE_SET_SYNC(&c->disk_sb)) - return; - - down(&c->sb_write_mutex); - - /* recheck, might have raced */ - if (bch_check_super_marked(c, k, meta)) { - up(&c->sb_write_mutex); - return; - } - - mi = c->disk_mi; - - extent_for_each_ptr(e, ptr) - if (bch_extent_ptr_is_dirty(c, e, ptr)) - (meta - ? SET_CACHE_HAS_METADATA - : SET_CACHE_HAS_DATA)(mi + ptr->dev, true); - - __bcache_write_super(c); -} - /* Cache set RO/RW: */ /* @@ -768,8 +197,10 @@ static void bch_cache_set_read_only_work(struct work_struct *work) if (!bch_journal_error(&c->journal) && !test_bit(CACHE_SET_ERROR, &c->flags)) { - SET_CACHE_SET_CLEAN(&c->disk_sb, true); - bcache_write_super(c); + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb, true); + bch_write_super(c); + mutex_unlock(&c->sb_lock); } } else { /* @@ -848,7 +279,7 @@ static const char *__bch_cache_set_read_write(struct cache_set *c) err = "error starting allocator thread"; for_each_cache(ca, c, i) - if (ca->mi.state == CACHE_ACTIVE && + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && bch_cache_allocator_start(ca)) { percpu_ref_put(&ca->ref); goto err; @@ -859,7 +290,7 @@ static const char *__bch_cache_set_read_write(struct cache_set *c) goto err; for_each_cache(ca, c, i) { - if (ca->mi.state != CACHE_ACTIVE) + if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) continue; err = "error starting moving GC thread"; @@ -913,6 +344,7 @@ static void cache_set_free(struct cache_set *c) cancel_work_sync(&c->bio_submit_work); cancel_work_sync(&c->read_retry_work); + bch_cache_set_encryption_free(c); bch_btree_cache_free(c); bch_journal_free(&c->journal); bch_io_clock_exit(&c->io_clock[WRITE]); @@ -939,7 +371,7 @@ static void cache_set_free(struct cache_set *c) destroy_workqueue(c->wq); kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */ - kfree(c->disk_mi); + free_pages((unsigned long) c->disk_sb, c->disk_sb_order); kfree(c); module_put(THIS_MODULE); } @@ -1043,15 +475,18 @@ void bch_cache_set_unregister(struct cache_set 
*c) static unsigned cache_set_nr_devices(struct cache_set *c) { + struct bch_sb_field_members *mi; unsigned i, nr = 0; - struct cache_member *mi = c->disk_mi; - lockdep_assert_held(&bch_register_lock); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); - for (i = 0; i < c->disk_sb.nr_in_set; i++) - if (!bch_is_zero(mi[i].uuid.b, sizeof(uuid_le))) + for (i = 0; i < c->disk_sb->nr_devices; i++) + if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) nr++; + mutex_unlock(&c->sb_lock); + return nr; } @@ -1059,7 +494,7 @@ static unsigned cache_set_nr_online_devices(struct cache_set *c) { unsigned i, nr = 0; - for (i = 0; i < c->sb.nr_in_set; i++) + for (i = 0; i < c->sb.nr_devices; i++) if (c->cache[i]) nr++; @@ -1069,7 +504,7 @@ static unsigned cache_set_nr_online_devices(struct cache_set *c) #define alloc_bucket_pages(gfp, ca) \ ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca)))) -static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, +static struct cache_set *bch_cache_set_alloc(struct bch_sb *sb, struct cache_set_opts opts) { struct cache_set *c; @@ -1083,13 +518,12 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, c->minor = -1; - sema_init(&c->sb_write_mutex, 1); + mutex_init(&c->sb_lock); INIT_RADIX_TREE(&c->devices, GFP_KERNEL); mutex_init(&c->btree_cache_lock); mutex_init(&c->bucket_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work); - mutex_init(&c->mi_lock); init_rwsem(&c->gc_lock); @@ -1146,10 +580,16 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, mutex_init(&c->uevent_lock); - if (cache_sb_to_cache_set(c, sb)) + mutex_lock(&c->sb_lock); + + if (bch_sb_to_cache_set(c, sb)) { + mutex_unlock(&c->sb_lock); goto err; + } + + mutex_unlock(&c->sb_lock); - scnprintf(c->name, sizeof(c->name), "%pU", &c->disk_sb.user_uuid); + scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); c->opts = cache_superblock_opts(sb); 
cache_set_opts_apply(&c->opts, opts); @@ -1165,7 +605,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, iter_size = (btree_blocks(c) + 1) * 2 * sizeof(struct btree_node_iter_set); - journal_entry_bytes = 512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb); + journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb); if (!(c->wq = alloc_workqueue("bcache", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || @@ -1185,7 +625,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, c->sb.btree_node_size, - CRC32_EXTENT_SIZE_MAX) / + BCH_ENCODED_EXTENT_MAX) / PAGE_SECTORS, 0) || !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) || lg_lock_init(&c->bucket_stats_lock) || @@ -1196,7 +636,9 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, bch_io_clock_init(&c->io_clock[WRITE]) || bch_journal_alloc(&c->journal, journal_entry_bytes) || bch_btree_cache_alloc(c) || - bch_compress_init(c)) + bch_cache_set_encryption_init(c) || + bch_compress_init(c) || + bch_check_set_has_compressed_data(c, c->opts.compression)) goto err; c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; @@ -1247,7 +689,7 @@ static int bch_cache_set_online(struct cache_set *c) if (IS_ERR(c->chardev)) return PTR_ERR(c->chardev); - if (kobject_add(&c->kobj, NULL, "%pU", c->disk_sb.user_uuid.b) || + if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || kobject_add(&c->internal, &c->kobj, "internal") || kobject_add(&c->opts_dir, &c->kobj, "options") || kobject_add(&c->time_stats, &c->kobj, "time_stats") || @@ -1267,6 +709,7 @@ static int bch_cache_set_online(struct cache_set *c) static const char *run_cache_set(struct cache_set *c) { const char *err = "cannot allocate memory"; + struct bch_sb_field_members *mi; struct cache *ca; unsigned i, id; time64_t now; @@ -1285,15 +728,9 @@ static const char *run_cache_set(struct cache_set *c) * we start testing it. 
*/ for_each_cache(ca, c, i) - cache_sb_from_cache_set(c, ca); + bch_sb_from_cache_set(c, ca); - /* - * CACHE_SET_SYNC is true if the cache set has already been run - * and potentially has data. - * It is false if it is the first time it is run. - */ - - if (CACHE_SET_SYNC(&c->disk_sb)) { + if (BCH_SB_INITIALIZED(c->disk_sb)) { ret = bch_journal_read(c, &journal); if (ret) goto err; @@ -1363,7 +800,7 @@ static const char *run_cache_set(struct cache_set *c) err = "error starting allocator thread"; for_each_cache(ca, c, i) - if (ca->mi.state == CACHE_ACTIVE && + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && bch_cache_allocator_start(ca)) { percpu_ref_put(&ca->ref); goto err; @@ -1381,25 +818,16 @@ static const char *run_cache_set(struct cache_set *c) if (c->opts.norecovery) goto recovery_done; - /* - * Write a new journal entry _before_ we start journalling new - * data - otherwise, we could end up with btree node bsets with - * journal seqs arbitrarily far in the future vs. the most - * recently written journal entry on disk, if we crash before - * writing the next journal entry: - */ - err = "error writing journal entry"; - if (bch_journal_meta(&c->journal)) - goto err; - bch_verbose(c, "starting fsck:"); err = "error in fsck"; ret = bch_fsck(c, !c->opts.nofsck); if (ret) goto err; + bch_verbose(c, "fsck done"); } else { - struct bkey_i_inode inode; + struct bch_inode_unpacked inode; + struct bkey_inode_buf packed_inode; struct closure cl; closure_init_stack(&cl); @@ -1424,7 +852,7 @@ static const char *run_cache_set(struct cache_set *c) err = "error starting allocator thread"; for_each_cache(ca, c, i) - if (ca->mi.state == CACHE_ACTIVE && + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE && bch_cache_allocator_start(ca)) { percpu_ref_put(&ca->ref); goto err; @@ -1442,10 +870,13 @@ static const char *run_cache_set(struct cache_set *c) bch_inode_init(c, &inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0); - inode.k.p.inode = BCACHE_ROOT_INO; + inode.inum = 
BCACHE_ROOT_INO; + + bch_inode_pack(&packed_inode, &inode); err = "error creating root directory"; - if (bch_btree_insert(c, BTREE_ID_INODES, &inode.k_i, + if (bch_btree_insert(c, BTREE_ID_INODES, + &packed_inode.inode.k_i, NULL, NULL, NULL, 0)) goto err; @@ -1462,16 +893,21 @@ recovery_done: goto err; } + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); now = ktime_get_seconds(); + rcu_read_lock(); for_each_cache_rcu(ca, c, i) - c->disk_mi[ca->sb.nr_this_dev].last_mount = cpu_to_le64(now); + mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); rcu_read_unlock(); - /* Mark cache set as initialized: */ - SET_CACHE_SET_SYNC(&c->disk_sb, true); - SET_CACHE_SET_CLEAN(&c->disk_sb, false); - bcache_write_super(c); + SET_BCH_SB_INITIALIZED(c->disk_sb, true); + SET_BCH_SB_CLEAN(c->disk_sb, false); + c->disk_sb->version = BCACHE_SB_VERSION_CDEV; + + bch_write_super(c); + mutex_unlock(&c->sb_lock); err = "dynamic fault"; if (cache_set_init_fault("run_cache_set")) @@ -1527,41 +963,46 @@ err: goto out; } -static const char *can_add_cache(struct cache_sb *sb, +static const char *can_add_cache(struct bch_sb *sb, struct cache_set *c) { + struct bch_sb_field_members *sb_mi; + + sb_mi = bch_sb_get_members(sb); + if (!sb_mi) + return "Invalid superblock: member info area missing"; + if (le16_to_cpu(sb->block_size) != c->sb.block_size) return "mismatched block size"; - if (le16_to_cpu(sb->members[sb->nr_this_dev].bucket_size) < - CACHE_SET_BTREE_NODE_SIZE(&c->disk_sb)) + if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < + BCH_SB_BTREE_NODE_SIZE(c->disk_sb)) return "new cache bucket_size is too small"; return NULL; } -static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c) +static const char *can_attach_cache(struct bch_sb *sb, struct cache_set *c) { + struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb); + struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb); + uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid; 
const char *err; - bool match; err = can_add_cache(sb, c); if (err) return err; + if (bch_is_zero(&dev_uuid, sizeof(dev_uuid))) + return "device has been removed"; + /* * When attaching an existing device, the cache set superblock must * already contain member_info with a matching UUID */ - match = le64_to_cpu(sb->seq) <= le64_to_cpu(c->disk_sb.seq) - ? (sb->nr_this_dev < c->disk_sb.nr_in_set && - !memcmp(&c->disk_mi[sb->nr_this_dev].uuid, - &sb->disk_uuid, sizeof(uuid_le))) - : (sb->nr_this_dev < sb->nr_in_set && - !memcmp(&sb->members[sb->nr_this_dev].uuid, - &sb->disk_uuid, sizeof(uuid_le))); - - if (!match) + if (sb->dev_idx >= c->disk_sb->nr_devices || + memcmp(&mi->members[sb->dev_idx].uuid, + &dev_uuid, sizeof(uuid_le))) return "cache sb does not match set"; return NULL; @@ -1572,13 +1013,14 @@ static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c) bool bch_cache_read_only(struct cache *ca) { struct cache_set *c = ca->set; + struct bch_sb_field_members *mi; char buf[BDEVNAME_SIZE]; bdevname(ca->disk_sb.bdev, buf); lockdep_assert_held(&bch_register_lock); - if (ca->mi.state != CACHE_ACTIVE) + if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE) return false; if (!bch_cache_may_remove(ca)) { @@ -1609,8 +1051,12 @@ bool bch_cache_read_only(struct cache *ca) bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf)); bch_notify_cache_read_only(ca); - SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO); - bcache_write_super(c); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], + BCH_MEMBER_STATE_RO); + bch_write_super(c); + mutex_unlock(&c->sb_lock); return true; } @@ -1618,7 +1064,7 @@ static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca) { lockdep_assert_held(&bch_register_lock); - if (ca->mi.state == CACHE_ACTIVE) + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) return NULL; if (test_bit(CACHE_DEV_REMOVING, &ca->flags)) @@ -1645,14 +1091,19 @@ 
static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca) const char *bch_cache_read_write(struct cache *ca) { struct cache_set *c = ca->set; + struct bch_sb_field_members *mi; const char *err; err = __bch_cache_read_write(c, ca); if (err) return err; - SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_ACTIVE); - bcache_write_super(c); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], + BCH_MEMBER_STATE_ACTIVE); + bch_write_super(c); + mutex_unlock(&c->sb_lock); return NULL; } @@ -1681,14 +1132,14 @@ static void bch_cache_free_work(struct work_struct *work) if (c && c->kobj.state_in_sysfs) { char buf[12]; - sprintf(buf, "cache%u", ca->sb.nr_this_dev); + sprintf(buf, "cache%u", ca->dev_idx); sysfs_remove_link(&c->kobj, buf); } if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); - free_super(&ca->disk_sb); + bch_free_super(&ca->disk_sb); /* * bch_cache_stop can be called in the middle of initialization @@ -1697,10 +1148,10 @@ static void bch_cache_free_work(struct work_struct *work) * However, they were zeroed when the object was allocated. 
*/ + bch_journal_free_cache(ca); free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->bucket_stats_percpu); - kfree(ca->journal.bucket_seq); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); kfree(ca->bio_prio); @@ -1754,8 +1205,8 @@ static void bch_cache_stop(struct cache *ca) lockdep_assert_held(&bch_register_lock); if (c) { - BUG_ON(rcu_access_pointer(c->cache[ca->sb.nr_this_dev]) != ca); - rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], NULL); + BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca); + rcu_assign_pointer(c->cache[ca->dev_idx], NULL); } call_rcu(&ca->free_rcu, bch_cache_free_rcu); @@ -1764,10 +1215,11 @@ static void bch_cache_stop(struct cache *ca) static void bch_cache_remove_work(struct work_struct *work) { struct cache *ca = container_of(work, struct cache, remove_work); + struct bch_sb_field_members *mi; struct cache_set *c = ca->set; char name[BDEVNAME_SIZE]; bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags); - unsigned dev = ca->sb.nr_this_dev; + unsigned dev_idx = ca->dev_idx; bdevname(ca->disk_sb.bdev, name); @@ -1780,17 +1232,21 @@ static void bch_cache_remove_work(struct work_struct *work) if (!ca->mi.has_data) { /* Nothing to do: */ } else if (!bch_move_data_off_device(ca)) { - lockdep_assert_held(&bch_register_lock); - SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false); - bcache_write_super(c); + bch_write_super(c); + mutex_unlock(&c->sb_lock); } else if (force) { bch_flag_data_bad(ca); - lockdep_assert_held(&bch_register_lock); - SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false); - bcache_write_super(c); + bch_write_super(c); + mutex_unlock(&c->sb_lock); } else { bch_err(c, "Remove of 
%s failed, unable to migrate data off", name); @@ -1803,10 +1259,12 @@ static void bch_cache_remove_work(struct work_struct *work) if (!ca->mi.has_metadata) { /* Nothing to do: */ } else if (!bch_move_meta_data_off_device(ca)) { - lockdep_assert_held(&bch_register_lock); - SET_CACHE_HAS_METADATA(&c->disk_mi[ca->sb.nr_this_dev], false); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); + SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false); - bcache_write_super(c); + bch_write_super(c); + mutex_unlock(&c->sb_lock); } else { bch_err(c, "Remove of %s failed, unable to migrate metadata off", name); @@ -1821,7 +1279,7 @@ static void bch_cache_remove_work(struct work_struct *work) bch_notify_cache_removed(ca); spin_lock(&c->journal.lock); - c->journal.prio_buckets[dev] = 0; + c->journal.prio_buckets[dev_idx] = 0; spin_unlock(&c->journal.lock); bch_journal_meta(&c->journal); @@ -1844,12 +1302,16 @@ static void bch_cache_remove_work(struct work_struct *work) lockdep_assert_held(&bch_register_lock); /* - * Free this device's slot in the cache_member array - all pointers to + * Free this device's slot in the bch_member array - all pointers to * this device must be gone: */ - memset(&c->disk_mi[dev].uuid, 0, sizeof(c->disk_mi[dev].uuid)); + mutex_lock(&c->sb_lock); + mi = bch_sb_get_members(c->disk_sb); + memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); + + bch_write_super(c); + mutex_unlock(&c->sb_lock); - bcache_write_super(c); mutex_unlock(&bch_register_lock); closure_put(&c->cl); @@ -1891,7 +1353,7 @@ static int bch_cache_online(struct cache *ca) lockdep_assert_held(&bch_register_lock); - sprintf(buf, "cache%u", ca->sb.nr_this_dev); + sprintf(buf, "cache%u", ca->dev_idx); if (kobject_add(&ca->kobj, &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, @@ -1907,13 +1369,14 @@ static const char *cache_alloc(struct bcache_superblock *sb, struct cache_set *c, struct cache **ret) { + struct bch_member *member; size_t reserve_none, 
movinggc_reserve, free_inc_reserve, total_reserve; size_t heap_size; - unsigned i, journal_entry_pages; + unsigned i; const char *err = "cannot allocate memory"; struct cache *ca; - if (c->sb.nr_in_set == 1) + if (c->sb.nr_devices == 1) bdevname(sb->bdev, c->name); if (cache_set_init_fault("cache_alloc")) @@ -1934,7 +1397,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, spin_lock_init(&ca->self.lock); ca->self.nr_devices = 1; rcu_assign_pointer(ca->self.d[0].dev, ca); - ca->sb.nr_this_dev = sb->sb->nr_this_dev; + ca->dev_idx = sb->sb->dev_idx; INIT_WORK(&ca->free_work, bch_cache_free_work); INIT_WORK(&ca->remove_work, bch_cache_remove_work); @@ -1953,8 +1416,11 @@ static const char *cache_alloc(struct bcache_superblock *sb, if (cache_set_init_fault("cache_alloc")) goto err; - ca->mi = cache_mi_to_cpu_mi(ca->disk_sb.sb->members + - ca->disk_sb.sb->nr_this_dev); + member = bch_sb_get_members(ca->disk_sb.sb)->members + + ca->disk_sb.sb->dev_idx; + + ca->mi = cache_mi_to_cpu_mi(member); + ca->uuid = member->uuid; ca->bucket_bits = ilog2(ca->mi.bucket_size); /* XXX: tune these */ @@ -1968,10 +1434,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, free_inc_reserve = movinggc_reserve / 2; heap_size = movinggc_reserve * 8; - journal_entry_pages = - DIV_ROUND_UP(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb), - PAGE_SECTORS); - if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], @@ -1987,13 +1449,11 @@ static const char *cache_alloc(struct bcache_superblock *sb, 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) || - !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb), - sizeof(u64), GFP_KERNEL)) || - !(ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages)) || - !(ca->bio_prio = 
bio_kmalloc(GFP_KERNEL, bucket_pages(ca))) || + !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) || bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio)) || - !(ca->sectors_written = alloc_percpu(*ca->sectors_written))) + !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || + bch_journal_init_cache(ca)) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -2006,15 +1466,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->copygc_write_point.group = &ca->self; ca->tiering_write_point.group = &ca->self; - kobject_get(&c->kobj); - ca->set = c; - - kobject_get(&ca->kobj); - rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], ca); - - if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb.seq)) - cache_sb_to_cache_set(c, ca->disk_sb.sb); - /* * Increase journal write timeout if flushes to this device are * expensive: @@ -2024,6 +1475,19 @@ static const char *cache_alloc(struct bcache_superblock *sb, c->journal.write_delay_ms = max(c->journal.write_delay_ms, 1000U); + kobject_get(&c->kobj); + ca->set = c; + + kobject_get(&ca->kobj); + rcu_assign_pointer(c->cache[ca->dev_idx], ca); + + mutex_lock(&c->sb_lock); + + if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq)) + bch_sb_to_cache_set(c, ca->disk_sb.sb); + + mutex_unlock(&c->sb_lock); + err = "error creating kobject"; if (c->kobj.state_in_sysfs && bch_cache_online(ca)) @@ -2046,7 +1510,7 @@ static struct cache_set *cache_set_lookup(uuid_le uuid) lockdep_assert_held(&bch_register_lock); list_for_each_entry(c, &bch_cache_sets, list) - if (!memcmp(&c->disk_sb.set_uuid, &uuid, sizeof(uuid_le))) + if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le))) return c; return NULL; @@ -2060,13 +1524,13 @@ static const char *register_cache(struct bcache_superblock *sb, struct cache_set *c; bool allocated_cache_set = false; - err = validate_cache_super(sb); + err = bch_validate_cache_super(sb); if (err) return err; bdevname(sb->bdev, name); 
- c = cache_set_lookup(sb->sb->set_uuid); + c = cache_set_lookup(sb->sb->uuid); if (c) { err = can_attach_cache(sb->sb, c); if (err) @@ -2106,20 +1570,23 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path) struct bcache_superblock sb; const char *err; struct cache *ca; - struct cache_member *new_mi = NULL; - struct cache_member mi; - unsigned nr_this_dev, nr_in_set, u64s; + struct bch_sb_field *f; + struct bch_sb_field_members *mi, *dev_mi; + struct bch_member saved_mi; + unsigned dev_idx, nr_devices, u64s; int ret = -EINVAL; mutex_lock(&bch_register_lock); - err = read_super(&sb, c->opts, path); + err = bch_read_super(&sb, c->opts, path); if (err) - goto err_unlock; + goto err_unlock_register; - err = validate_cache_super(&sb); + err = bch_validate_cache_super(&sb); if (err) - goto err_unlock; + goto err_unlock_register; + + mutex_lock(&c->sb_lock); err = can_add_cache(sb.sb, c); if (err) @@ -2129,8 +1596,9 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path) * Preserve the old cache member information (esp. tier) * before we start bashing the disk stuff. 
*/ - mi = sb.sb->members[sb.sb->nr_this_dev]; - mi.last_mount = cpu_to_le64(ktime_get_seconds()); + dev_mi = bch_sb_get_members(sb.sb); + saved_mi = dev_mi->members[sb.sb->dev_idx]; + saved_mi.last_mount = cpu_to_le64(ktime_get_seconds()); down_read(&c->gc_lock); @@ -2140,9 +1608,10 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path) if (test_bit(CACHE_SET_GC_FAILURE, &c->flags)) goto no_slot; - for (nr_this_dev = 0; nr_this_dev < MAX_CACHES_PER_SET; nr_this_dev++) - if (nr_this_dev >= c->sb.nr_in_set || - bch_is_zero(c->disk_mi[nr_this_dev].uuid.b, + mi = bch_sb_get_members(c->disk_sb); + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) + if (dev_idx >= c->sb.nr_devices || + bch_is_zero(mi->members[dev_idx].uuid.b, sizeof(uuid_le))) goto have_slot; no_slot: @@ -2153,52 +1622,46 @@ no_slot: goto err_unlock; have_slot: - nr_in_set = max_t(unsigned, nr_this_dev + 1, c->sb.nr_in_set); up_read(&c->gc_lock); - u64s = nr_in_set * (sizeof(struct cache_member) / sizeof(u64)); + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devices) / sizeof(u64); err = "no space in superblock for member info"; - if (bch_super_realloc(&sb, u64s)) + + f = bch_fs_sb_field_resize(c, &mi->field, u64s); + if (!f) goto err_unlock; - new_mi = dynamic_fault("bcache:add:member_info_realloc") - ? 
NULL - : kmalloc(sizeof(struct cache_member) * nr_in_set, - GFP_KERNEL); - if (!new_mi) { - err = "cannot allocate memory"; - ret = -ENOMEM; + mi = container_of(f, struct bch_sb_field_members, field); + + f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s); + if (!f) goto err_unlock; - } - memcpy(new_mi, c->disk_mi, - sizeof(struct cache_member) * nr_in_set); - new_mi[nr_this_dev] = mi; + dev_mi = container_of(f, struct bch_sb_field_members, field); + memcpy(dev_mi, mi, u64s * sizeof(u64)); + dev_mi->members[dev_idx] = saved_mi; - sb.sb->nr_this_dev = nr_this_dev; - sb.sb->nr_in_set = nr_in_set; - sb.sb->u64s = cpu_to_le16(u64s); - memcpy(sb.sb->members, new_mi, - sizeof(struct cache_member) * nr_in_set); + sb.sb->dev_idx = dev_idx; + sb.sb->nr_devices = nr_devices; - if (cache_set_mi_update(c, new_mi, nr_in_set)) { + if (bch_cache_set_mi_update(c, dev_mi->members, nr_devices)) { err = "cannot allocate memory"; ret = -ENOMEM; goto err_unlock; } /* commit new member info */ - swap(c->disk_mi, new_mi); - kfree(new_mi); - new_mi = NULL; - c->disk_sb.nr_in_set = nr_in_set; - c->sb.nr_in_set = nr_in_set; + memcpy(mi, dev_mi, u64s * sizeof(u64)); + c->disk_sb->nr_devices = nr_devices; + c->sb.nr_devices = nr_devices; err = cache_alloc(&sb, c, &ca); if (err) goto err_unlock; - bcache_write_super(c); + bch_write_super(c); err = "journal alloc failed"; if (bch_cache_journal_alloc(ca)) @@ -2206,21 +1669,23 @@ have_slot: bch_notify_cache_added(ca); - if (ca->mi.state == CACHE_ACTIVE) { + if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) { err = __bch_cache_read_write(c, ca); if (err) goto err_put; } kobject_put(&ca->kobj); + mutex_unlock(&c->sb_lock); mutex_unlock(&bch_register_lock); return 0; err_put: bch_cache_stop(ca); err_unlock: - kfree(new_mi); - free_super(&sb); + mutex_unlock(&c->sb_lock); +err_unlock_register: mutex_unlock(&bch_register_lock); + bch_free_super(&sb); bch_err(c, "Unable to add device: %s", err); return ret ?: -EINVAL; @@ -2250,14 +1715,14 @@ const char 
*bch_register_cache_set(char * const *devices, unsigned nr_devices, goto err; /* - * read_super() needs to happen under register_lock, so that the + * bch_read_super() needs to happen under register_lock, so that the * exclusive open is atomic with adding the new cache set to the list of * cache sets: */ mutex_lock(&bch_register_lock); for (i = 0; i < nr_devices; i++) { - err = read_super(&sb[i], opts, devices[i]); + err = bch_read_super(&sb[i], opts, devices[i]); if (err) goto err_unlock; @@ -2265,13 +1730,13 @@ const char *bch_register_cache_set(char * const *devices, unsigned nr_devices, if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version))) goto err_unlock; - err = validate_cache_super(&sb[i]); + err = bch_validate_cache_super(&sb[i]); if (err) goto err_unlock; } err = "cache set already registered"; - if (cache_set_lookup(sb->sb->set_uuid)) + if (cache_set_lookup(sb->sb->uuid)) goto err_unlock; err = "cannot allocate memory"; @@ -2317,7 +1782,7 @@ err_unlock: mutex_unlock(&bch_register_lock); err: for (i = 0; i < nr_devices; i++) - free_super(&sb[i]); + bch_free_super(&sb[i]); goto out; } @@ -2329,7 +1794,7 @@ const char *bch_register_one(const char *path) mutex_lock(&bch_register_lock); - err = read_super(&sb, opts, path); + err = bch_read_super(&sb, opts, path); if (err) goto err; @@ -2338,7 +1803,7 @@ const char *bch_register_one(const char *path) else err = register_cache(&sb, opts); - free_super(&sb); + bch_free_super(&sb); err: mutex_unlock(&bch_register_lock); return err; @@ -2440,8 +1905,8 @@ static void bcache_exit(void) class_destroy(bch_chardev_class); if (bch_chardev_major > 0) unregister_chrdev(bch_chardev_major, "bcache"); - if (!IS_ERR_OR_NULL(bch_sha1)) - crypto_free_shash(bch_sha1); + if (!IS_ERR_OR_NULL(bch_sha256)) + crypto_free_shash(bch_sha256); unregister_reboot_notifier(&reboot); } @@ -2459,8 +1924,8 @@ static int __init bcache_init(void) closure_debug_init(); bkey_pack_test(); - bch_sha1 = crypto_alloc_shash("sha1", 0, 0); - if 
(IS_ERR(bch_sha1)) + bch_sha256 = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(bch_sha256)) goto err; bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops); diff --git a/libbcache/super.h b/libbcache/super.h index 635e1a6f..014d7aed 100644 --- a/libbcache/super.h +++ b/libbcache/super.h @@ -18,17 +18,12 @@ static inline sector_t bucket_remainder(const struct cache *ca, sector_t s) return s & (ca->mi.bucket_size - 1); } -#define cache_member_info_get(_c) \ - (rcu_read_lock(), rcu_dereference((_c)->members)) - -#define cache_member_info_put() rcu_read_unlock() - static inline struct cache *bch_next_cache_rcu(struct cache_set *c, unsigned *iter) { struct cache *ret = NULL; - while (*iter < c->sb.nr_in_set && + while (*iter < c->sb.nr_devices && !(ret = rcu_dereference(c->cache[*iter]))) (*iter)++; @@ -59,40 +54,6 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c, (ca = bch_get_next_cache(c, &(iter))); \ percpu_ref_put(&ca->ref), (iter)++) -void bch_check_mark_super_slowpath(struct cache_set *, - const struct bkey_i *, bool); - -static inline bool bch_check_super_marked(struct cache_set *c, - const struct bkey_i *k, bool meta) -{ - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - struct cache_member_cpu *mi = cache_member_info_get(c)->m; - bool ret = true; - - extent_for_each_ptr(e, ptr) - if (!(meta - ? 
mi[ptr->dev].has_metadata - : mi[ptr->dev].has_data) && - bch_extent_ptr_is_dirty(c, e, ptr)) { - ret = false; - break; - } - - cache_member_info_put(); - - return ret; -} - -static inline void bch_check_mark_super(struct cache_set *c, - const struct bkey_i *k, bool meta) -{ - if (bch_check_super_marked(c, k, meta)) - return; - - bch_check_mark_super_slowpath(c, k, meta); -} - static inline bool bch_cache_may_remove(struct cache *ca) { struct cache_set *c = ca->set; @@ -119,11 +80,6 @@ static inline bool bch_cache_may_remove(struct cache *ca) rcu_access_pointer(tier->d[0].dev) != ca; } -void free_super(struct bcache_superblock *); -int bch_super_realloc(struct bcache_superblock *, unsigned); -void bcache_write_super(struct cache_set *); -void __write_super(struct cache_set *, struct bcache_superblock *); - void bch_cache_set_release(struct kobject *); void bch_cache_release(struct kobject *); @@ -149,7 +105,7 @@ extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; extern struct idr bch_cache_set_minor; extern struct workqueue_struct *bcache_io_wq; -extern struct crypto_shash *bch_sha1; +extern struct crypto_shash *bch_sha256; extern struct kobj_type bch_cache_set_ktype; extern struct kobj_type bch_cache_set_internal_ktype; diff --git a/libbcache/super_types.h b/libbcache/super_types.h index d89f780f..41eaf0dd 100644 --- a/libbcache/super_types.h +++ b/libbcache/super_types.h @@ -2,7 +2,7 @@ #define _BCACHE_SUPER_TYPES_H struct bcache_superblock { - struct cache_sb *sb; + struct bch_sb *sb; struct block_device *bdev; struct bio *bio; unsigned page_order; diff --git a/libbcache/sysfs.c b/libbcache/sysfs.c index 58a71259..57b7dd9d 100644 --- a/libbcache/sysfs.c +++ b/libbcache/sysfs.c @@ -8,9 +8,11 @@ #include "bcache.h" #include "alloc.h" #include "blockdev.h" +#include "compress.h" #include "sysfs.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_update.h" #include "btree_gc.h" #include "buckets.h" #include "inode.h" @@ 
-19,6 +21,7 @@ #include "move.h" #include "opts.h" #include "request.h" +#include "super-io.h" #include "writeback.h" #include <linux/blkdev.h> @@ -139,14 +142,14 @@ read_attribute(tier); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ static struct attribute sysfs_opt_##_name = { \ .name = #_name, \ .mode = S_IRUGO|(_perm ? S_IWUSR : 0) \ }; - CACHE_SET_VISIBLE_OPTS() -#undef CACHE_SET_OPT + BCH_VISIBLE_OPTS() +#undef BCH_OPT #define BCH_TIME_STAT(name, frequency_units, duration_units) \ sysfs_time_stats_attribute(name, frequency_units, duration_units); @@ -193,8 +196,8 @@ SHOW(bch_cached_dev) sysfs_print(state, states[BDEV_STATE(dc->disk_sb.sb)]); if (attr == &sysfs_label) { - memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE); - buf[SB_LABEL_SIZE + 1] = '\0'; + memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); + buf[BCH_SB_LABEL_SIZE + 1] = '\0'; strcat(buf, "\n"); return strlen(buf); } @@ -248,24 +251,25 @@ STORE(__cached_dev) u64 journal_seq = 0; int ret = 0; - if (size > SB_LABEL_SIZE) + if (size > BCH_SB_LABEL_SIZE) return -EINVAL; mutex_lock(&dc->disk.inode_lock); memcpy(dc->disk_sb.sb->label, buf, size); - if (size < SB_LABEL_SIZE) + if (size < BCH_SB_LABEL_SIZE) dc->disk_sb.sb->label[size] = '\0'; if (size && dc->disk_sb.sb->label[size - 1] == '\n') dc->disk_sb.sb->label[size - 1] = '\0'; memcpy(dc->disk.inode.v.i_label, - dc->disk_sb.sb->label, SB_LABEL_SIZE); + dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE); bch_write_bdev_super(dc, NULL); if (dc->disk.c) - ret = bch_inode_update(dc->disk.c, &dc->disk.inode.k_i, + ret = bch_btree_update(dc->disk.c, BTREE_ID_INODES, + &dc->disk.inode.k_i, &journal_seq); mutex_unlock(&dc->disk.inode_lock); @@ -367,8 +371,8 @@ SHOW(bch_blockdev_volume) sysfs_hprint(size, le64_to_cpu(d->inode.v.i_size)); if (attr == &sysfs_label) { - memcpy(buf, d->inode.v.i_label, SB_LABEL_SIZE); - 
buf[SB_LABEL_SIZE + 1] = '\0'; + memcpy(buf, d->inode.v.i_label, BCH_SB_LABEL_SIZE); + buf[BCH_SB_LABEL_SIZE + 1] = '\0'; strcat(buf, "\n"); return strlen(buf); } @@ -397,7 +401,8 @@ STORE(__bch_blockdev_volume) } } d->inode.v.i_size = cpu_to_le64(v); - ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq); + ret = bch_btree_update(d->c, BTREE_ID_INODES, + &d->inode.k_i, &journal_seq); mutex_unlock(&d->inode_lock); @@ -417,8 +422,9 @@ STORE(__bch_blockdev_volume) mutex_lock(&d->inode_lock); - memcpy(d->inode.v.i_label, buf, SB_LABEL_SIZE); - ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq); + memcpy(d->inode.v.i_label, buf, BCH_SB_LABEL_SIZE); + ret = bch_btree_update(d->c, BTREE_ID_INODES, + &d->inode.k_i, &journal_seq); mutex_unlock(&d->inode_lock); @@ -677,10 +683,8 @@ SHOW(bch_cache_set) sysfs_print(tiering_percent, c->tiering_percent); sysfs_pd_controller_show(tiering, &c->tiering_pd); - sysfs_printf(meta_replicas_have, "%llu", - CACHE_SET_META_REPLICAS_HAVE(&c->disk_sb)); - sysfs_printf(data_replicas_have, "%llu", - CACHE_SET_DATA_REPLICAS_HAVE(&c->disk_sb)); + sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have); + sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have); /* Debugging: */ @@ -705,7 +709,7 @@ SHOW(bch_cache_set) if (attr == &sysfs_compression_stats) return bch_compression_stats(c, buf); - sysfs_printf(internal_uuid, "%pU", c->disk_sb.set_uuid.b); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); return 0; } @@ -945,15 +949,15 @@ SHOW(bch_cache_set_opts_dir) { struct cache_set *c = container_of(kobj, struct cache_set, opts_dir); -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ if (attr == &sysfs_opt_##_name) \ return _choices == bch_bool_opt || _choices == bch_uint_opt\ ? 
snprintf(buf, PAGE_SIZE, "%i\n", c->opts._name)\ : bch_snprint_string_list(buf, PAGE_SIZE, \ _choices, c->opts._name);\ - CACHE_SET_VISIBLE_OPTS() -#undef CACHE_SET_OPT + BCH_VISIBLE_OPTS() +#undef BCH_OPT return 0; } @@ -962,7 +966,7 @@ STORE(bch_cache_set_opts_dir) { struct cache_set *c = container_of(kobj, struct cache_set, opts_dir); -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ if (attr == &sysfs_opt_##_name) { \ ssize_t v = (_choices == bch_bool_opt || \ _choices == bch_uint_opt) \ @@ -972,18 +976,28 @@ STORE(bch_cache_set_opts_dir) if (v < 0) \ return v; \ \ - c->opts._name = v; \ + mutex_lock(&c->sb_lock); \ + if (attr == &sysfs_opt_compression) { \ + int ret = bch_check_set_has_compressed_data(c, v);\ + if (ret) { \ + mutex_unlock(&c->sb_lock); \ + return ret; \ + } \ + } \ \ - if (_sb_opt##_BITS && v != _sb_opt(&c->disk_sb)) { \ - SET_##_sb_opt(&c->disk_sb, v); \ - bcache_write_super(c); \ + if (_sb_opt##_BITS && v != _sb_opt(c->disk_sb)) { \ + SET_##_sb_opt(c->disk_sb, v); \ + bch_write_super(c); \ } \ \ + c->opts._name = v; \ + mutex_unlock(&c->sb_lock); \ + \ return size; \ } - CACHE_SET_VISIBLE_OPTS() -#undef CACHE_SET_OPT + BCH_VISIBLE_OPTS() +#undef BCH_OPT return size; } @@ -993,11 +1007,11 @@ static void bch_cache_set_opts_dir_release(struct kobject *k) } static struct attribute *bch_cache_set_opts_dir_files[] = { -#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ +#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm) \ &sysfs_opt_##_name, - CACHE_SET_VISIBLE_OPTS() -#undef CACHE_SET_OPT + BCH_VISIBLE_OPTS() +#undef BCH_OPT NULL }; @@ -1176,7 +1190,7 @@ SHOW(bch_cache) struct cache_set *c = ca->set; struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca); - sysfs_printf(uuid, "%pU\n", ca->disk_sb.sb->disk_uuid.b); + sysfs_printf(uuid, "%pU\n", ca->uuid.b); sysfs_hprint(bucket_size, bucket_bytes(ca)); 
sysfs_print(bucket_size_bytes, bucket_bytes(ca)); @@ -1242,17 +1256,21 @@ STORE(__bch_cache) { struct cache *ca = container_of(kobj, struct cache, kobj); struct cache_set *c = ca->set; - struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev]; + struct bch_member *mi; sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); - if (v != CACHE_DISCARD(mi)) { - SET_CACHE_DISCARD(mi, v); - bcache_write_super(c); + mutex_lock(&c->sb_lock); + mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + + if (v != BCH_MEMBER_DISCARD(mi)) { + SET_BCH_MEMBER_DISCARD(mi, v); + bch_write_super(c); } + mutex_unlock(&c->sb_lock); } if (attr == &sysfs_cache_replacement_policy) { @@ -1261,10 +1279,14 @@ STORE(__bch_cache) if (v < 0) return v; - if ((unsigned) v != CACHE_REPLACEMENT(mi)) { - SET_CACHE_REPLACEMENT(mi, v); - bcache_write_super(c); + mutex_lock(&c->sb_lock); + mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx]; + + if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { + SET_BCH_MEMBER_REPLACEMENT(mi, v); + bch_write_super(c); } + mutex_unlock(&c->sb_lock); } if (attr == &sysfs_state_rw) { @@ -1279,14 +1301,14 @@ STORE(__bch_cache) return size; switch (v) { - case CACHE_ACTIVE: + case BCH_MEMBER_STATE_ACTIVE: err = bch_cache_read_write(ca); break; - case CACHE_RO: + case BCH_MEMBER_STATE_RO: bch_cache_read_only(ca); break; - case CACHE_FAILED: - case CACHE_SPARE: + case BCH_MEMBER_STATE_FAILED: + case BCH_MEMBER_STATE_SPARE: /* * XXX: need to migrate data off and set correct state */ diff --git a/libbcache/tier.c b/libbcache/tier.c index 39b04f7b..46864594 100644 --- a/libbcache/tier.c +++ b/libbcache/tier.c @@ -8,6 +8,7 @@ #include "io.h" #include "keylist.h" #include "move.h" +#include "super-io.h" #include "tier.h" #include <linux/freezer.h> @@ -40,7 +41,7 @@ static bool tiering_pred(struct cache_set *c, mi = cache_member_info_get(c); extent_for_each_ptr(e, ptr) - if (ptr->dev < mi->nr_in_set && + 
if (ptr->dev < mi->nr_devices && mi->m[ptr->dev].tier >= s->tier_idx) replicas++; cache_member_info_put(); diff --git a/libbcache/vstructs.h b/libbcache/vstructs.h new file mode 100644 index 00000000..ce2cece0 --- /dev/null +++ b/libbcache/vstructs.h @@ -0,0 +1,62 @@ +#ifndef _VSTRUCTS_H +#define _VSTRUCTS_H + +#include "util.h" + +/* + * NOTE: we can't differentiate between __le64 and u64 with type_is - this + * assumes u64 is little endian: + */ +#define __vstruct_u64s(_s) \ +({ \ + ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \ + : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \ + : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \ + : ((_s)->u64s)); \ +}) + +#define __vstruct_bytes(_type, _u64s) \ +({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ + (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ +}) + +#define vstruct_bytes(_s) \ + __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) + +#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ + (round_up(__vstruct_bytes(_type, _u64s), \ + 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) + +#define vstruct_blocks(_s, _sector_block_bits) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) + +#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ + __vstruct_u64s(_s) + (_u64s)) + +#define vstruct_sectors(_s, _sector_block_bits) \ + (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) + +#define vstruct_next(_s) \ + ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_last(_s) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_end(_s) \ + ((void *) ((_s)->_data + __vstruct_u64s(_s))) + +#define vstruct_for_each(_s, _i) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s); \ + _i = vstruct_next(_i)) + +#define vstruct_for_each_safe(_s, _i, _t) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s) && (_t = vstruct_next(_i), 
true); \ + _i = _t) + +#define vstruct_idx(_s, _idx) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) + +#endif /* _VSTRUCTS_H */ diff --git a/libbcache/xattr.c b/libbcache/xattr.c index e9e0a9a7..56a8e8f8 100644 --- a/libbcache/xattr.c +++ b/libbcache/xattr.c @@ -9,7 +9,6 @@ #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> -#include <crypto/hash.h> struct xattr_search_key { u8 type; @@ -22,37 +21,13 @@ struct xattr_search_key { static u64 bch_xattr_hash(const struct bch_hash_info *info, const struct xattr_search_key *key) { - switch (info->type) { - case BCH_STR_HASH_SHA1: { - SHASH_DESC_ON_STACK(desc, bch_sha1); - u8 digest[SHA1_DIGEST_SIZE]; - u64 ret; + struct bch_str_hash_ctx ctx; - desc->tfm = bch_sha1; - desc->flags = 0; - crypto_shash_init(desc); + bch_str_hash_init(&ctx, info); + bch_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); + bch_str_hash_update(&ctx, info, key->name.name, key->name.len); - crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed)); - - crypto_shash_update(desc, (void *) &key->type, sizeof(key->type)); - crypto_shash_update(desc, (void *) key->name.name, key->name.len); - - crypto_shash_final(desc, digest); - memcpy(&ret, &digest, sizeof(ret)); - return ret >> 1; - } - default: { - struct bch_str_hash_ctx ctx; - - bch_str_hash_init(&ctx, info->type); - bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed)); - - bch_str_hash_update(&ctx, info->type, &key->type, sizeof(key->type)); - bch_str_hash_update(&ctx, info->type, key->name.name, key->name.len); - - return bch_str_hash_end(&ctx, info->type); - } - } + return bch_str_hash_end(&ctx, info); } #define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len) diff --git a/linux/crypto/algapi.c b/linux/crypto/algapi.c deleted file mode 100644 index 5e8e97b9..00000000 --- a/linux/crypto/algapi.c +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Cryptographic API for algorithms (i.e., low-level API). 
- * - * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - */ - -#include <linux/byteorder.h> -#include <linux/err.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/printk.h> -#include <linux/rtnetlink.h> -#include <linux/slab.h> -#include <linux/string.h> - -#include "internal.h" - -static inline int crypto_set_driver_name(struct crypto_alg *alg) -{ - static const char suffix[] = "-generic"; - char *driver_name = alg->cra_driver_name; - int len; - - if (*driver_name) - return 0; - - len = strlcpy(driver_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); - if (len + sizeof(suffix) > CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - memcpy(driver_name + len, suffix, sizeof(suffix)); - return 0; -} - -static int crypto_check_alg(struct crypto_alg *alg) -{ - if (alg->cra_alignmask & (alg->cra_alignmask + 1)) - return -EINVAL; - - if (alg->cra_blocksize > PAGE_SIZE / 8) - return -EINVAL; - - if (alg->cra_priority < 0) - return -EINVAL; - - atomic_set(&alg->cra_refcnt, 1); - - return crypto_set_driver_name(alg); -} - -static int __crypto_register_alg(struct crypto_alg *alg) -{ - struct crypto_alg *q; - int ret = -EAGAIN; - - INIT_LIST_HEAD(&alg->cra_users); - - ret = -EEXIST; - - list_for_each_entry(q, &crypto_alg_list, cra_list) { - if (q == alg) - goto err; - - if (!strcmp(q->cra_driver_name, alg->cra_name) || - !strcmp(q->cra_name, alg->cra_driver_name)) - goto err; - } - - list_add(&alg->cra_list, &crypto_alg_list); - return 0; -err: - return ret; -} - -void crypto_remove_final(struct list_head *list) -{ - struct crypto_alg *alg; - struct crypto_alg *n; - - list_for_each_entry_safe(alg, n, list, cra_list) { - 
list_del_init(&alg->cra_list); - crypto_alg_put(alg); - } -} - -int crypto_register_alg(struct crypto_alg *alg) -{ - int err; - - err = crypto_check_alg(alg); - if (err) - return err; - - down_write(&crypto_alg_sem); - err = __crypto_register_alg(alg); - up_write(&crypto_alg_sem); - - return err; -} - -static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list) -{ - if (unlikely(list_empty(&alg->cra_list))) - return -ENOENT; - - list_del_init(&alg->cra_list); - return 0; -} - -int crypto_unregister_alg(struct crypto_alg *alg) -{ - int ret; - LIST_HEAD(list); - - down_write(&crypto_alg_sem); - ret = crypto_remove_alg(alg, &list); - up_write(&crypto_alg_sem); - - if (ret) - return ret; - - BUG_ON(atomic_read(&alg->cra_refcnt) != 1); - if (alg->cra_destroy) - alg->cra_destroy(alg); - - crypto_remove_final(&list); - return 0; -} - -int crypto_register_algs(struct crypto_alg *algs, int count) -{ - int i, ret; - - for (i = 0; i < count; i++) { - ret = crypto_register_alg(&algs[i]); - if (ret) - goto err; - } - - return 0; - -err: - for (--i; i >= 0; --i) - crypto_unregister_alg(&algs[i]); - - return ret; -} - -int crypto_unregister_algs(struct crypto_alg *algs, int count) -{ - int i, ret; - - for (i = 0; i < count; i++) { - ret = crypto_unregister_alg(&algs[i]); - if (ret) - pr_err("Failed to unregister %s %s: %d\n", - algs[i].cra_driver_name, algs[i].cra_name, ret); - } - - return 0; -} - -struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb) -{ - struct rtattr *rta = tb[0]; - struct crypto_attr_type *algt; - - if (!rta) - return ERR_PTR(-ENOENT); - if (RTA_PAYLOAD(rta) < sizeof(*algt)) - return ERR_PTR(-EINVAL); - if (rta->rta_type != CRYPTOA_TYPE) - return ERR_PTR(-EINVAL); - - algt = RTA_DATA(rta); - - return algt; -} - -int crypto_check_attr_type(struct rtattr **tb, u32 type) -{ - struct crypto_attr_type *algt; - - algt = crypto_get_attr_type(tb); - if (IS_ERR(algt)) - return PTR_ERR(algt); - - if ((algt->type ^ type) & algt->mask) - 
return -EINVAL; - - return 0; -} - -const char *crypto_attr_alg_name(struct rtattr *rta) -{ - struct crypto_attr_alg *alga; - - if (!rta) - return ERR_PTR(-ENOENT); - if (RTA_PAYLOAD(rta) < sizeof(*alga)) - return ERR_PTR(-EINVAL); - if (rta->rta_type != CRYPTOA_ALG) - return ERR_PTR(-EINVAL); - - alga = RTA_DATA(rta); - alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0; - - return alga->name; -} - -struct crypto_alg *crypto_attr_alg2(struct rtattr *rta, - const struct crypto_type *frontend, - u32 type, u32 mask) -{ - const char *name; - - name = crypto_attr_alg_name(rta); - if (IS_ERR(name)) - return ERR_CAST(name); - - return crypto_find_alg(name, frontend, type, mask); -} - -int crypto_attr_u32(struct rtattr *rta, u32 *num) -{ - struct crypto_attr_u32 *nu32; - - if (!rta) - return -ENOENT; - if (RTA_PAYLOAD(rta) < sizeof(*nu32)) - return -EINVAL; - if (rta->rta_type != CRYPTOA_U32) - return -EINVAL; - - nu32 = RTA_DATA(rta); - *num = nu32->num; - - return 0; -} - -static inline void crypto_inc_byte(u8 *a, unsigned int size) -{ - u8 *b = (a + size); - u8 c; - - for (; size; size--) { - c = *--b + 1; - *b = c; - if (c) - break; - } -} - -void crypto_inc(u8 *a, unsigned int size) -{ - __be32 *b = (__be32 *)(a + size); - u32 c; - - for (; size >= 4; size -= 4) { - c = be32_to_cpu(*--b) + 1; - *b = cpu_to_be32(c); - if (c) - return; - } - - crypto_inc_byte(a, size); -} - -static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size) -{ - for (; size; size--) - *a++ ^= *b++; -} - -void crypto_xor(u8 *dst, const u8 *src, unsigned int size) -{ - u32 *a = (u32 *)dst; - u32 *b = (u32 *)src; - - for (; size >= 4; size -= 4) - *a++ ^= *b++; - - crypto_xor_byte((u8 *)a, (u8 *)b, size); -} - -unsigned int crypto_alg_extsize(struct crypto_alg *alg) -{ - return alg->cra_ctxsize + - (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1)); -} - -int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, - u32 type, u32 mask) -{ - int ret = 0; - struct 
crypto_alg *alg = crypto_find_alg(name, frontend, type, mask); - - if (!IS_ERR(alg)) { - crypto_alg_put(alg); - ret = 1; - } - - return ret; -} - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Cryptographic algorithms API"); diff --git a/linux/crypto/api.c b/linux/crypto/api.c index 513a48aa..2d24630e 100644 --- a/linux/crypto/api.c +++ b/linux/crypto/api.c @@ -1,12 +1,7 @@ /* - * Scatterlist Cryptographic API. + * Cryptographic API for algorithms (i.e., low-level API). * - * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> - * Copyright (c) 2002 David S. Miller (davem@redhat.com) - * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> - * - * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> - * and Nettle, by Niels Möller. + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -19,183 +14,80 @@ #include <linux/err.h> #include <linux/errno.h> #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/param.h> -#include <linux/sched.h> +#include <linux/list.h> +#include <linux/rwsem.h> #include <linux/slab.h> #include <linux/string.h> + +#include <crypto/algapi.h> #include "internal.h" -LIST_HEAD(crypto_alg_list); -DECLARE_RWSEM(crypto_alg_sem); +static LIST_HEAD(crypto_alg_list); +static DECLARE_RWSEM(crypto_alg_sem); -static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, - u32 mask) +static unsigned crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask) { - struct crypto_alg *q, *alg = NULL; - int best = -2; - - list_for_each_entry(q, &crypto_alg_list, cra_list) { - int exact, fuzzy; - - if ((q->cra_flags ^ type) & mask) - continue; - - exact = !strcmp(q->cra_driver_name, name); - fuzzy = !strcmp(q->cra_name, name); - if (!exact && !(fuzzy && q->cra_priority > best)) - continue; - - if (unlikely(!crypto_alg_get(q))) - continue; - 
- best = q->cra_priority; - if (alg) - crypto_alg_put(alg); - alg = q; - - if (exact) - break; - } + return alg->cra_type->ctxsize(alg, type, mask); +} - return alg; +unsigned crypto_alg_extsize(struct crypto_alg *alg) +{ + return alg->cra_ctxsize; } struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; - /* - * If the internal flag is set for a cipher, require a caller to - * to invoke the cipher with the internal flag to use that cipher. - * Also, if a caller wants to allocate a cipher that may or may - * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and - * !(mask & CRYPTO_ALG_INTERNAL). - */ - if (!((type | mask) & CRYPTO_ALG_INTERNAL)) - mask |= CRYPTO_ALG_INTERNAL; - down_read(&crypto_alg_sem); - alg = __crypto_alg_lookup(name, type, mask); - up_read(&crypto_alg_sem); + list_for_each_entry(alg, &crypto_alg_list, cra_list) + if (!((alg->cra_flags ^ type) & mask) && + !strcmp(alg->cra_name, name)) + goto found; - return alg ?: ERR_PTR(-ENOENT); -} - -static int crypto_init_ops(struct crypto_tfm *tfm, u32 type, u32 mask) -{ - const struct crypto_type *type_obj = tfm->__crt_alg->cra_type; - - if (type_obj) - return type_obj->init(tfm, type, mask); - - switch (crypto_tfm_alg_type(tfm)) { - case CRYPTO_ALG_TYPE_CIPHER: - return crypto_init_cipher_ops(tfm); - default: - break; - } + alg = ERR_PTR(-ENOENT); +found: + up_read(&crypto_alg_sem); - BUG(); - return -EINVAL; + return alg; } static void crypto_exit_ops(struct crypto_tfm *tfm) { - const struct crypto_type *type = tfm->__crt_alg->cra_type; - - if (type) { - if (tfm->exit) - tfm->exit(tfm); - return; - } - - switch (crypto_tfm_alg_type(tfm)) { - case CRYPTO_ALG_TYPE_CIPHER: - crypto_exit_cipher_ops(tfm); - break; - - default: - BUG(); - } -} - -static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask) -{ - const struct crypto_type *type_obj = alg->cra_type; - unsigned int len; - - len = alg->cra_alignmask & 
~(crypto_tfm_ctx_alignment() - 1); - if (type_obj) - return len + type_obj->ctxsize(alg, type, mask); - - switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) { - default: - BUG(); - - case CRYPTO_ALG_TYPE_CIPHER: - len += crypto_cipher_ctxsize(alg); - break; - } - - return len; + if (tfm->exit) + tfm->exit(tfm); } -struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, - u32 mask) +static struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, + u32 type, u32 mask) { struct crypto_tfm *tfm = NULL; - unsigned int tfm_size; + unsigned tfm_size; int err = -ENOMEM; tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask); tfm = kzalloc(tfm_size, GFP_KERNEL); if (tfm == NULL) - goto out_err; + return ERR_PTR(-ENOMEM); tfm->__crt_alg = alg; - err = crypto_init_ops(tfm, type, mask); + err = alg->cra_type->init(tfm, type, mask); if (err) goto out_free_tfm; if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm))) goto cra_init_failed; - goto out; + return tfm; cra_init_failed: crypto_exit_ops(tfm); out_free_tfm: kfree(tfm); -out_err: - tfm = ERR_PTR(err); -out: - return tfm; + return ERR_PTR(err); } -/* - * crypto_alloc_base - Locate algorithm and allocate transform - * @alg_name: Name of algorithm - * @type: Type of algorithm - * @mask: Mask for type comparison - * - * This function should not be used by new algorithm types. - * Please use crypto_alloc_tfm instead. - * - * crypto_alloc_base() will first attempt to locate an already loaded - * algorithm. If that fails and the kernel supports dynamically loadable - * modules, it will then attempt to load a module of the same name or - * alias. If that fails it will send a query to any loaded crypto manager - * to construct an algorithm on the fly. A refcount is grabbed on the - * algorithm which is then associated with the new transform. - * - * The returned transform is of a non-determinate type. 
Most people - * should use one of the more specific allocation functions such as - * crypto_alloc_blkcipher. - * - * In case of error the return value is an error pointer. - */ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask) { struct crypto_alg *alg; @@ -208,31 +100,29 @@ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask) } tfm = __crypto_alloc_tfm(alg, type, mask); - if (IS_ERR(tfm)) { - crypto_alg_put(alg); + if (IS_ERR(tfm)) return tfm; - } return tfm; } -void *crypto_create_tfm(struct crypto_alg *alg, - const struct crypto_type *frontend) +static void *crypto_create_tfm(struct crypto_alg *alg, + const struct crypto_type *frontend) { - char *mem; struct crypto_tfm *tfm = NULL; - unsigned int tfmsize; - unsigned int total; + unsigned tfmsize; + unsigned total; + void *mem; int err = -ENOMEM; tfmsize = frontend->tfmsize; total = tfmsize + sizeof(*tfm) + frontend->extsize(alg); mem = kzalloc(total, GFP_KERNEL); - if (mem == NULL) + if (!mem) goto out_err; - tfm = (struct crypto_tfm *)(mem + tfmsize); + tfm = mem + tfmsize; tfm->__crt_alg = alg; err = frontend->init_tfm(tfm); @@ -254,28 +144,23 @@ out: return mem; } -struct crypto_alg *crypto_find_alg(const char *alg_name, - const struct crypto_type *frontend, - u32 type, u32 mask) +static struct crypto_alg *crypto_find_alg(const char *alg_name, + const struct crypto_type *frontend, + u32 type, u32 mask) { - struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask) = - crypto_alg_mod_lookup; - if (frontend) { type &= frontend->maskclear; mask &= frontend->maskclear; type |= frontend->type; mask |= frontend->maskset; - - if (frontend->lookup) - lookup = frontend->lookup; } - return lookup(alg_name, type, mask); + return crypto_alg_mod_lookup(alg_name, type, mask); } void *crypto_alloc_tfm(const char *alg_name, - const struct crypto_type *frontend, u32 type, u32 mask) + const struct crypto_type *frontend, + u32 type, u32 mask) { struct crypto_alg 
*alg; void *tfm; @@ -285,10 +170,8 @@ void *crypto_alloc_tfm(const char *alg_name, return ERR_CAST(alg); tfm = crypto_create_tfm(alg, frontend); - if (IS_ERR(tfm)) { - crypto_alg_put(alg); + if (IS_ERR(tfm)) return tfm; - } return tfm; } @@ -305,22 +188,16 @@ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm) if (!tfm->exit && alg->cra_exit) alg->cra_exit(tfm); crypto_exit_ops(tfm); - crypto_alg_put(alg); kzfree(mem); } -int crypto_has_alg(const char *name, u32 type, u32 mask) +int crypto_register_alg(struct crypto_alg *alg) { - int ret = 0; - struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask); + INIT_LIST_HEAD(&alg->cra_users); - if (!IS_ERR(alg)) { - crypto_alg_put(alg); - ret = 1; - } + down_write(&crypto_alg_sem); + list_add(&alg->cra_list, &crypto_alg_list); + up_write(&crypto_alg_sem); - return ret; + return 0; } - -MODULE_DESCRIPTION("Cryptographic core API"); -MODULE_LICENSE("GPL"); diff --git a/linux/crypto/blkcipher.c b/linux/crypto/blkcipher.c new file mode 100644 index 00000000..31f91418 --- /dev/null +++ b/linux/crypto/blkcipher.c @@ -0,0 +1,47 @@ +/* + * Block chaining cipher operations. + * + * Generic encrypt/decrypt wrapper for ciphers, handles operations across + * multiple page boundaries by using temporary blocks. In user context, + * the kernel is given a chance to schedule us once per page. + * + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include <crypto/algapi.h> +#include "internal.h" + +static unsigned crypto_blkcipher_ctxsize(struct crypto_alg *alg, + u32 type, u32 mask) +{ + return alg->cra_ctxsize; +} + +static int crypto_init_blkcipher_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +{ + struct blkcipher_tfm *crt = &tfm->crt_blkcipher; + struct blkcipher_alg *alg = &tfm->__crt_alg->cra_blkcipher; + + BUG_ON((mask & CRYPTO_ALG_TYPE_MASK) != CRYPTO_ALG_TYPE_MASK); + + crt->setkey = alg->setkey; + crt->encrypt = alg->encrypt; + crt->decrypt = alg->decrypt; + return 0; +} + +const struct crypto_type crypto_blkcipher_type = { + .ctxsize = crypto_blkcipher_ctxsize, + .init = crypto_init_blkcipher_ops, +}; diff --git a/linux/crypto/chacha20_generic.c b/linux/crypto/chacha20_generic.c new file mode 100644 index 00000000..7ac68321 --- /dev/null +++ b/linux/crypto/chacha20_generic.c @@ -0,0 +1,99 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/byteorder.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/scatterlist.h> +#include <asm/unaligned.h> + +#include <linux/crypto.h> +#include <crypto/algapi.h> +#include <crypto/chacha20.h> + +#include <sodium/crypto_stream_chacha20.h> + +struct chacha20_ctx { + u32 key[8]; +}; + +static int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keysize) +{ + struct chacha20_ctx *ctx = crypto_tfm_ctx(tfm); + int i; + + if (keysize != CHACHA20_KEY_SIZE) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(ctx->key); i++) + ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); + + return 0; +} + +static int crypto_chacha20_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, + unsigned nbytes) +{ + struct chacha20_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct scatterlist *sg = src; + u32 iv[4]; + int ret; + + BUG_ON(src != dst); + + memcpy(iv, desc->info, sizeof(iv)); + + while (1) { + ret = crypto_stream_chacha20_xor_ic(sg_virt(sg), + sg_virt(sg), + sg->length, + (void *) &iv[2], + iv[0] | ((u64) iv[1] << 32), + (void *) ctx->key); + BUG_ON(ret); + + nbytes -= sg->length; + + if (sg_is_last(sg)) + break; + + BUG_ON(sg->length % CHACHA20_BLOCK_SIZE); + iv[0] += sg->length / CHACHA20_BLOCK_SIZE; + sg = sg_next(sg); + }; + + BUG_ON(nbytes); + + return 0; +} + +static struct crypto_alg alg = { + .cra_name = "chacha20", + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_type = &crypto_blkcipher_type, + .cra_ctxsize = sizeof(struct chacha20_ctx), + .cra_u = { + .blkcipher = { + .setkey = crypto_chacha20_setkey, + .encrypt = crypto_chacha20_crypt, + .decrypt = crypto_chacha20_crypt, + }, + }, +}; + +__attribute__((constructor(110))) +static int chacha20_generic_mod_init(void) +{ + return crypto_register_alg(&alg); +} diff --git a/linux/crypto/cipher.c b/linux/crypto/cipher.c deleted file mode 100644 index 6f47ac6c..00000000 --- a/linux/crypto/cipher.c +++ /dev/null 
@@ -1,123 +0,0 @@ -/* - * Cryptographic API. - * - * Cipher operations. - * - * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> - * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - */ - -#include <linux/kernel.h> -#include <linux/crypto.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/string.h> -#include "internal.h" - -static int setkey_unaligned(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct cipher_alg *cia = &tfm->__crt_alg->cra_cipher; - unsigned long alignmask = crypto_tfm_alg_alignmask(tfm); - int ret; - u8 *buffer, *alignbuffer; - unsigned long absize; - - absize = keylen + alignmask; - buffer = kmalloc(absize, GFP_ATOMIC); - if (!buffer) - return -ENOMEM; - - alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); - memcpy(alignbuffer, key, keylen); - ret = cia->cia_setkey(tfm, alignbuffer, keylen); - memset(alignbuffer, 0, keylen); - kfree(buffer); - return ret; - -} - -static int setkey_default(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) -{ - struct cipher_alg *cia = &tfm->__crt_alg->cra_cipher; - unsigned long alignmask = crypto_tfm_alg_alignmask(tfm); - - tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; - if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) { - tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - - if ((unsigned long)key & alignmask) - return setkey_unaligned(tfm, key, keylen); - - return cia->cia_setkey(tfm, key, keylen); -} - -static void cipher_crypt_unaligned(void (*fn)(struct crypto_tfm *, u8 *, - const u8 *), - struct crypto_tfm *tfm, - u8 *dst, const u8 *src) -{ - unsigned long alignmask = crypto_tfm_alg_alignmask(tfm); - unsigned int size = 
crypto_tfm_alg_blocksize(tfm); - u8 buffer[size + alignmask]; - u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); - - memcpy(tmp, src, size); - fn(tfm, tmp, tmp); - memcpy(dst, tmp, size); -} - -static void cipher_encrypt_unaligned(struct crypto_tfm *tfm, - u8 *dst, const u8 *src) -{ - unsigned long alignmask = crypto_tfm_alg_alignmask(tfm); - struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher; - - if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) { - cipher_crypt_unaligned(cipher->cia_encrypt, tfm, dst, src); - return; - } - - cipher->cia_encrypt(tfm, dst, src); -} - -static void cipher_decrypt_unaligned(struct crypto_tfm *tfm, - u8 *dst, const u8 *src) -{ - unsigned long alignmask = crypto_tfm_alg_alignmask(tfm); - struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher; - - if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) { - cipher_crypt_unaligned(cipher->cia_decrypt, tfm, dst, src); - return; - } - - cipher->cia_decrypt(tfm, dst, src); -} - -int crypto_init_cipher_ops(struct crypto_tfm *tfm) -{ - struct cipher_tfm *ops = &tfm->crt_cipher; - struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher; - - ops->cit_setkey = setkey_default; - ops->cit_encrypt_one = crypto_tfm_alg_alignmask(tfm) ? - cipher_encrypt_unaligned : cipher->cia_encrypt; - ops->cit_decrypt_one = crypto_tfm_alg_alignmask(tfm) ? 
- cipher_decrypt_unaligned : cipher->cia_decrypt; - - return 0; -} - -void crypto_exit_cipher_ops(struct crypto_tfm *tfm) -{ -} diff --git a/linux/crypto/internal.h b/linux/crypto/internal.h index b00dcea2..5b21f836 100644 --- a/linux/crypto/internal.h +++ b/linux/crypto/internal.h @@ -13,66 +13,11 @@ #ifndef _CRYPTO_INTERNAL_H #define _CRYPTO_INTERNAL_H -#include <crypto/algapi.h> -#include <linux/completion.h> -#include <linux/mm.h> -#include <linux/list.h> -#include <linux/kernel.h> -#include <linux/notifier.h> -#include <linux/rwsem.h> -#include <linux/slab.h> +struct crypto_type; +struct crypto_alg; -struct crypto_instance; -struct crypto_template; - -struct crypto_larval { - struct crypto_alg alg; - struct crypto_alg *adult; - struct completion completion; - u32 mask; -}; - -extern struct list_head crypto_alg_list; -extern struct rw_semaphore crypto_alg_sem; - -static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg) -{ - return alg->cra_ctxsize; -} - -int crypto_init_cipher_ops(struct crypto_tfm *tfm); -void crypto_exit_cipher_ops(struct crypto_tfm *tfm); - -void crypto_remove_final(struct list_head *list); -struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, - u32 mask); -void *crypto_create_tfm(struct crypto_alg *alg, - const struct crypto_type *frontend); -struct crypto_alg *crypto_find_alg(const char *alg_name, - const struct crypto_type *frontend, - u32 type, u32 mask); -void *crypto_alloc_tfm(const char *alg_name, - const struct crypto_type *frontend, u32 type, u32 mask); - -int crypto_register_notifier(struct notifier_block *nb); -int crypto_unregister_notifier(struct notifier_block *nb); - -unsigned int crypto_alg_extsize(struct crypto_alg *alg); - -int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, - u32 type, u32 mask); - -static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) -{ - atomic_inc(&alg->cra_refcnt); - return alg; -} - -static inline void 
crypto_alg_put(struct crypto_alg *alg) -{ - if (atomic_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy) - alg->cra_destroy(alg); -} +void *crypto_alloc_tfm(const char *, const struct crypto_type *, u32, u32); +unsigned int crypto_alg_extsize(struct crypto_alg *); #endif /* _CRYPTO_INTERNAL_H */ diff --git a/linux/crypto/poly1305_generic.c b/linux/crypto/poly1305_generic.c new file mode 100644 index 00000000..5d385d54 --- /dev/null +++ b/linux/crypto/poly1305_generic.c @@ -0,0 +1,76 @@ +/* + * Poly1305 authenticator algorithm, RFC7539 + * + * Copyright (C) 2015 Martin Willi + * + * Based on public domain code by Andrew Moon and Daniel J. Bernstein. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/byteorder.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <asm/unaligned.h> + +#include <linux/crypto.h> +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/poly1305.h> + +struct poly1305_desc_ctx { + bool key_done; + crypto_onetimeauth_poly1305_state s; +}; + + +static int poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *state = shash_desc_ctx(desc); + + state->key_done = false; + return 0; +} + +static int poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned len) +{ + struct poly1305_desc_ctx *state = shash_desc_ctx(desc); + + if (!state->key_done) { + BUG_ON(len != crypto_onetimeauth_poly1305_KEYBYTES); + + state->key_done = true; + return crypto_onetimeauth_poly1305_init(&state->s, src); + } + + return crypto_onetimeauth_poly1305_update(&state->s, src, len); +} + +static int poly1305_final(struct shash_desc *desc, u8 *out) +{ + struct poly1305_desc_ctx *state = shash_desc_ctx(desc); + + return crypto_onetimeauth_poly1305_final(&state->s, out); +} + +static 
struct shash_alg poly1305_alg = { + .digestsize = crypto_onetimeauth_poly1305_BYTES, + .init = poly1305_init, + .update = poly1305_update, + .final = poly1305_final, + .descsize = sizeof(struct poly1305_desc_ctx), + .base = { + .cra_name = "poly1305", + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + }, +}; + +__attribute__((constructor(110))) +static int poly1305_mod_init(void) +{ + return crypto_register_shash(&poly1305_alg); +} diff --git a/linux/crypto/sha1_generic.c b/linux/crypto/sha1_generic.c deleted file mode 100644 index 31b5d12e..00000000 --- a/linux/crypto/sha1_generic.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Cryptographic API. - * - * SHA1 Secure Hash Algorithm. - * - * Derived from cryptoapi implementation, adapted for in-place - * scatterlist interface. - * - * Copyright (c) Alan Smithee. - * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * - */ -#include <crypto/internal/hash.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/cryptohash.h> -#include <linux/types.h> -#include <crypto/sha.h> -#include <crypto/sha1_base.h> -#include <asm/byteorder.h> - -const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { - 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, - 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, - 0xaf, 0xd8, 0x07, 0x09 -}; - -static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src, - int blocks) -{ - u32 temp[SHA_WORKSPACE_WORDS]; - - while (blocks--) { - sha_transform(sst->state, src, temp); - src += SHA1_BLOCK_SIZE; - } - memzero_explicit(temp, sizeof(temp)); -} - -int crypto_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha1_base_do_update(desc, data, len, sha1_generic_block_fn); -} - -static int sha1_final(struct shash_desc *desc, u8 *out) -{ - sha1_base_do_finalize(desc, sha1_generic_block_fn); - return sha1_base_finish(desc, out); -} - -int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha1_base_do_update(desc, data, len, sha1_generic_block_fn); - return sha1_final(desc, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = crypto_sha1_update, - .final = sha1_final, - .finup = crypto_sha1_finup, - .descsize = sizeof(struct sha1_state), - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-generic", - .cra_flags = CRYPTO_ALG_TYPE_SHASH, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -__attribute__((constructor(110))) -static int __init sha1_generic_mod_init(void) -{ - return crypto_register_shash(&alg); -} diff --git a/linux/crypto/sha256_generic.c b/linux/crypto/sha256_generic.c new file mode 100644 index 00000000..0bd272f0 --- /dev/null +++ b/linux/crypto/sha256_generic.c @@ -0,0 +1,69 @@ +/* + * Cryptographic API. 
+ * + * SHA-256, as specified in + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + * + * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <linux/bitops.h> +#include <linux/byteorder.h> +#include <linux/types.h> +#include <asm/unaligned.h> + +#include <linux/crypto.h> +#include <crypto/internal/hash.h> + +#include <sodium/crypto_hash_sha256.h> + +static int sha256_init(struct shash_desc *desc) +{ + crypto_hash_sha256_state *state = shash_desc_ctx(desc); + + return crypto_hash_sha256_init(state); +} + +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + crypto_hash_sha256_state *state = shash_desc_ctx(desc); + + return crypto_hash_sha256_update(state, data, len); +} + +static int sha256_final(struct shash_desc *desc, u8 *out) +{ + crypto_hash_sha256_state *state = shash_desc_ctx(desc); + + return crypto_hash_sha256_final(state, out); +} + +static struct shash_alg sha256_alg = { + .digestsize = crypto_hash_sha256_BYTES, + .init = sha256_init, + .update = sha256_update, + .final = sha256_final, + .descsize = sizeof(crypto_hash_sha256_state), + .base = { + .cra_name = "sha256", + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + } +}; + +__attribute__((constructor(110))) +static int __init sha256_generic_mod_init(void) +{ + return crypto_register_shash(&sha256_alg); +} diff --git a/linux/crypto/shash.c b/linux/crypto/shash.c index 406ddfe8..4f07a8b8 100644 --- 
a/linux/crypto/shash.c +++ b/linux/crypto/shash.c @@ -13,181 +13,25 @@ #include <crypto/internal/hash.h> #include <linux/err.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/printk.h> #include <linux/slab.h> #include "internal.h" -static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) -{ - return -ENOSYS; -} - -static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) -{ - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned long alignmask = crypto_shash_alignmask(tfm); - unsigned long absize; - u8 *buffer, *alignbuffer; - int err; - - absize = keylen + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); - buffer = kmalloc(absize, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); - memcpy(alignbuffer, key, keylen); - err = shash->setkey(tfm, alignbuffer, keylen); - kzfree(buffer); - return err; -} - -int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, - unsigned int keylen) -{ - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned long alignmask = crypto_shash_alignmask(tfm); - - if ((unsigned long)key & alignmask) - return shash_setkey_unaligned(tfm, key, keylen); - - return shash->setkey(tfm, key, keylen); -} - -static inline unsigned int shash_align_buffer_size(unsigned len, - unsigned long mask) -{ - typedef u8 __attribute__ ((aligned)) u8_aligned; - return len + (mask & ~(__alignof__(u8_aligned) - 1)); -} - -static int shash_update_unaligned(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct crypto_shash *tfm = desc->tfm; - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned long alignmask = crypto_shash_alignmask(tfm); - unsigned int unaligned_len = alignmask + 1 - - ((unsigned long)data & alignmask); - u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)] - __attribute__ ((aligned)); - u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); - int 
err; - - if (unaligned_len > len) - unaligned_len = len; - - memcpy(buf, data, unaligned_len); - err = shash->update(desc, buf, unaligned_len); - memset(buf, 0, unaligned_len); - - return err ?: - shash->update(desc, data + unaligned_len, len - unaligned_len); -} - -int crypto_shash_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct crypto_shash *tfm = desc->tfm; - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned long alignmask = crypto_shash_alignmask(tfm); - - if ((unsigned long)data & alignmask) - return shash_update_unaligned(desc, data, len); - - return shash->update(desc, data, len); -} - -static int shash_final_unaligned(struct shash_desc *desc, u8 *out) -{ - struct crypto_shash *tfm = desc->tfm; - unsigned long alignmask = crypto_shash_alignmask(tfm); - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned int ds = crypto_shash_digestsize(tfm); - u8 ubuf[shash_align_buffer_size(ds, alignmask)] - __attribute__ ((aligned)); - u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1); - int err; - - err = shash->final(desc, buf); - if (err) - goto out; - - memcpy(out, buf, ds); - -out: - memset(buf, 0, ds); - return err; -} - -int crypto_shash_final(struct shash_desc *desc, u8 *out) -{ - struct crypto_shash *tfm = desc->tfm; - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned long alignmask = crypto_shash_alignmask(tfm); - - if ((unsigned long)out & alignmask) - return shash_final_unaligned(desc, out); - - return shash->final(desc, out); -} - -static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int shash_finup(struct shash_desc *desc, const u8 *data, + unsigned len, u8 *out) { return crypto_shash_update(desc, data, len) ?: crypto_shash_final(desc, out); } -int crypto_shash_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct crypto_shash *tfm = desc->tfm; - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned long 
alignmask = crypto_shash_alignmask(tfm); - - if (((unsigned long)data | (unsigned long)out) & alignmask) - return shash_finup_unaligned(desc, data, len, out); - - return shash->finup(desc, data, len, out); -} - -static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int shash_digest(struct shash_desc *desc, const u8 *data, + unsigned len, u8 *out) { return crypto_shash_init(desc) ?: crypto_shash_finup(desc, data, len, out); } -int crypto_shash_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct crypto_shash *tfm = desc->tfm; - struct shash_alg *shash = crypto_shash_alg(tfm); - unsigned long alignmask = crypto_shash_alignmask(tfm); - - if (((unsigned long)data | (unsigned long)out) & alignmask) - return shash_digest_unaligned(desc, data, len, out); - - return shash->digest(desc, data, len, out); -} - -static int shash_default_export(struct shash_desc *desc, void *out) -{ - memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm)); - return 0; -} - -static int shash_default_import(struct shash_desc *desc, const void *in) -{ - memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(desc->tfm)); - return 0; -} - static int crypto_shash_init_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); @@ -197,98 +41,32 @@ static int crypto_shash_init_tfm(struct crypto_tfm *tfm) } static const struct crypto_type crypto_shash_type = { - .extsize = crypto_alg_extsize, - .init_tfm = crypto_shash_init_tfm, - .maskclear = ~CRYPTO_ALG_TYPE_MASK, - .maskset = CRYPTO_ALG_TYPE_MASK, - .type = CRYPTO_ALG_TYPE_SHASH, - .tfmsize = offsetof(struct crypto_shash, base), + .extsize = crypto_alg_extsize, + .init_tfm = crypto_shash_init_tfm, + .maskclear = ~CRYPTO_ALG_TYPE_MASK, + .maskset = CRYPTO_ALG_TYPE_MASK, + .type = CRYPTO_ALG_TYPE_SHASH, + .tfmsize = offsetof(struct crypto_shash, base), }; -struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 
type, - u32 mask) +struct crypto_shash *crypto_alloc_shash(const char *alg_name, + u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask); } -static int shash_prepare_alg(struct shash_alg *alg) +int crypto_register_shash(struct shash_alg *alg) { struct crypto_alg *base = &alg->base; - if (alg->digestsize > PAGE_SIZE / 8 || - alg->descsize > PAGE_SIZE / 8 || - alg->statesize > PAGE_SIZE / 8) - return -EINVAL; - base->cra_type = &crypto_shash_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_SHASH; if (!alg->finup) - alg->finup = shash_finup_unaligned; + alg->finup = shash_finup; if (!alg->digest) - alg->digest = shash_digest_unaligned; - if (!alg->export) { - alg->export = shash_default_export; - alg->import = shash_default_import; - alg->statesize = alg->descsize; - } - if (!alg->setkey) - alg->setkey = shash_no_setkey; - - return 0; -} - -int crypto_register_shash(struct shash_alg *alg) -{ - struct crypto_alg *base = &alg->base; - int err; - - err = shash_prepare_alg(alg); - if (err) - return err; + alg->digest = shash_digest; return crypto_register_alg(base); } - -int crypto_unregister_shash(struct shash_alg *alg) -{ - return crypto_unregister_alg(&alg->base); -} - -int crypto_register_shashes(struct shash_alg *algs, int count) -{ - int i, ret; - - for (i = 0; i < count; i++) { - ret = crypto_register_shash(&algs[i]); - if (ret) - goto err; - } - - return 0; - -err: - for (--i; i >= 0; --i) - crypto_unregister_shash(&algs[i]); - - return ret; -} - -int crypto_unregister_shashes(struct shash_alg *algs, int count) -{ - int i, ret; - - for (i = count - 1; i >= 0; --i) { - ret = crypto_unregister_shash(&algs[i]); - if (ret) - pr_err("Failed to unregister %s %s: %d\n", - algs[i].base.cra_driver_name, - algs[i].base.cra_name, ret); - } - - return 0; -} - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Synchronous cryptographic hash type"); diff --git a/linux/lz4hc_compress.c b/linux/lz4hc_compress.c deleted 
file mode 100644 index b64ded0d..00000000 --- a/linux/lz4hc_compress.c +++ /dev/null @@ -1,454 +0,0 @@ -/* - * LZ4 HC - High Compression Mode of LZ4 - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ - * - * Changed for kernel use by: - * Chanho Min <chanho.min@lge.com> - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/lz4.h> -#include <asm/unaligned.h> -#include "lz4defs.h" - -struct lz4hc_data { - const u8 *base; - HTYPE hashtable[HASHTABLESIZE]; - u16 chaintable[MAXD]; - const u8 *nexttoupdate; -} __attribute__((__packed__)); - -static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) -{ - memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); - memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); - -#if LZ4_ARCH64 - hc4->nexttoupdate = base + 1; -#else - hc4->nexttoupdate = base; -#endif - hc4->base = base; - return 1; -} - -/* Update chains up to ip (excluded) */ -static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) -{ - u16 *chaintable = hc4->chaintable; - HTYPE *hashtable = hc4->hashtable; -#if LZ4_ARCH64 - const u8 * const base = hc4->base; -#else - const int base = 0; -#endif - - while (hc4->nexttoupdate < ip) { - const u8 *p = hc4->nexttoupdate; - size_t delta = p - (hashtable[HASH_VALUE(p)] + base); - if (delta > MAX_DISTANCE) - delta = MAX_DISTANCE; - chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; - hashtable[HASH_VALUE(p)] = (p) - base; - hc4->nexttoupdate++; - } -} - -static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, - const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) -{ - u16 *const chaintable = hc4->chaintable; - HTYPE *const hashtable = hc4->hashtable; - const u8 *ref; -#if LZ4_ARCH64 - const u8 * const base = hc4->base; -#else - const int base = 0; -#endif - int nbattempts = MAX_NB_ATTEMPTS; - size_t repl = 0, ml = 0; - u16 delta; - - /* HC4 match finder */ - lz4hc_insert(hc4, ip); - ref = hashtable[HASH_VALUE(ip)] + base; - - /* potential repetition */ - if (ref >= ip-4) { - /* 
confirmed */ - if (A32(ref) == A32(ip)) { - delta = (u16)(ip-ref); - repl = ml = common_length(ip + MINMATCH, - ref + MINMATCH, matchlimit) + MINMATCH; - *matchpos = ref; - } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; - } - - while ((ref >= ip - MAX_DISTANCE) && nbattempts) { - nbattempts--; - if (*(ref + ml) == *(ip + ml)) { - if (A32(ref) == A32(ip)) { - size_t mlt = - common_length(ip + MINMATCH, - ref + MINMATCH, matchlimit) + MINMATCH; - if (mlt > ml) { - ml = mlt; - *matchpos = ref; - } - } - } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; - } - - /* Complete table */ - if (repl) { - const u8 *ptr = ip; - const u8 *end; - end = ip + repl - (MINMATCH-1); - /* Pre-Load */ - while (ptr < end - delta) { - chaintable[(size_t)(ptr) & MAXD_MASK] = delta; - ptr++; - } - do { - chaintable[(size_t)(ptr) & MAXD_MASK] = delta; - /* Head of chain */ - hashtable[HASH_VALUE(ptr)] = (ptr) - base; - ptr++; - } while (ptr < end); - hc4->nexttoupdate = end; - } - - return (int)ml; -} - -static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, - const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, - const u8 **matchpos, const u8 **startpos) -{ - u16 *const chaintable = hc4->chaintable; - HTYPE *const hashtable = hc4->hashtable; -#if LZ4_ARCH64 - const u8 * const base = hc4->base; -#else - const int base = 0; -#endif - const u8 *ref; - int nbattempts = MAX_NB_ATTEMPTS; - int delta = (int)(ip - startlimit); - - /* First Match */ - lz4hc_insert(hc4, ip); - ref = hashtable[HASH_VALUE(ip)] + base; - - while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) - && (nbattempts)) { - nbattempts--; - if (*(startlimit + longest) == *(ref - delta + longest)) { - if (A32(ref) == A32(ip)) { - const u8 *reft = ref; - const u8 *startt = ip; - unsigned length = - common_length(ip + MINMATCH, - ref + MINMATCH, - matchlimit); - - while ((startt > startlimit) - && (reft > hc4->base) - && (startt[-1] == reft[-1])) { - startt--; - reft--; - 
length++; - } - - if (length > longest) { - longest = length; - *matchpos = reft; - *startpos = startt; - } - } - } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; - } - return longest; -} - -static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, - int ml, const u8 *ref) -{ - unsigned length; - u8 *token; - - /* Encode Literal length */ - length = *ip - *anchor; - token = (*op)++; - *token = encode_length(op, length) << ML_BITS; - - /* Copy Literals */ - MEMCPY_ADVANCE_CHUNKED(*op, *anchor, length); - - /* Encode Offset */ - PUT_LE16_ADVANCE(*op, (u16)(*ip - ref)); - - *token += encode_length(op, ml - MINMATCH); - - /* Prepare next loop */ - *ip += ml; - *anchor = *ip; - - return 0; -} - -static int lz4_compresshcctx(struct lz4hc_data *ctx, - const char *source, - char *dest, - int isize) -{ - const u8 *ip = (const u8 *)source; - const u8 *anchor = ip; - const u8 *const iend = ip + isize; - const u8 *const mflimit = iend - MFLIMIT; - const u8 *const matchlimit = (iend - LASTLITERALS); - - u8 *op = (u8 *)dest; - - int ml, ml2, ml3, ml0; - const u8 *ref = NULL; - const u8 *start2 = NULL; - const u8 *ref2 = NULL; - const u8 *start3 = NULL; - const u8 *ref3 = NULL; - const u8 *start0; - const u8 *ref0; - int lastrun; - - ip++; - - /* Main Loop */ - while (ip < mflimit) { - ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); - if (!ml) { - ip++; - continue; - } - - /* saved, in case we would skip too much */ - start0 = ip; - ref0 = ref; - ml0 = ml; -_search2: - if (ip+ml < mflimit) - ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, - ip + 1, matchlimit, ml, &ref2, &start2); - else - ml2 = ml; - /* No better match */ - if (ml2 == ml) { - lz4_encodesequence(&ip, &op, &anchor, ml, ref); - continue; - } - - if (start0 < ip) { - /* empirical */ - if (start2 < ip + ml0) { - ip = start0; - ref = ref0; - ml = ml0; - } - } - /* - * Here, start0==ip - * First Match too small : removed - */ - if ((start2 - ip) < 3) { - ml = ml2; - 
ip = start2; - ref = ref2; - goto _search2; - } - -_search3: - /* - * Currently we have : - * ml2 > ml1, and - * ip1+3 <= ip2 (usually < ip1+ml1) - */ - if ((start2 - ip) < OPTIMAL_ML) { - int correction; - int new_ml = ml; - if (new_ml > OPTIMAL_ML) - new_ml = OPTIMAL_ML; - if (ip + new_ml > start2 + ml2 - MINMATCH) - new_ml = (int)(start2 - ip) + ml2 - MINMATCH; - correction = new_ml - (int)(start2 - ip); - if (correction > 0) { - start2 += correction; - ref2 += correction; - ml2 -= correction; - } - } - /* - * Now, we have start2 = ip+new_ml, - * with new_ml=min(ml, OPTIMAL_ML=18) - */ - if (start2 + ml2 < mflimit) - ml3 = lz4hc_insertandgetwidermatch(ctx, - start2 + ml2 - 3, start2, matchlimit, - ml2, &ref3, &start3); - else - ml3 = ml2; - - /* No better match : 2 sequences to encode */ - if (ml3 == ml2) { - /* ip & ref are known; Now for ml */ - if (start2 < ip+ml) - ml = (int)(start2 - ip); - - /* Now, encode 2 sequences */ - lz4_encodesequence(&ip, &op, &anchor, ml, ref); - ip = start2; - lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); - continue; - } - - /* Not enough space for match 2 : remove it */ - if (start3 < ip + ml + 3) { - /* - * can write Seq1 immediately ==> Seq2 is removed, - * so Seq3 becomes Seq1 - */ - if (start3 >= (ip + ml)) { - if (start2 < ip + ml) { - int correction = - (int)(ip + ml - start2); - start2 += correction; - ref2 += correction; - ml2 -= correction; - if (ml2 < MINMATCH) { - start2 = start3; - ref2 = ref3; - ml2 = ml3; - } - } - - lz4_encodesequence(&ip, &op, &anchor, ml, ref); - ip = start3; - ref = ref3; - ml = ml3; - - start0 = start2; - ref0 = ref2; - ml0 = ml2; - goto _search2; - } - - start2 = start3; - ref2 = ref3; - ml2 = ml3; - goto _search3; - } - - /* - * OK, now we have 3 ascending matches; let's write at least - * the first one ip & ref are known; Now for ml - */ - if (start2 < ip + ml) { - if ((start2 - ip) < (int)ML_MASK) { - int correction; - if (ml > OPTIMAL_ML) - ml = OPTIMAL_ML; - if (ip + ml > start2 + 
ml2 - MINMATCH) - ml = (int)(start2 - ip) + ml2 - - MINMATCH; - correction = ml - (int)(start2 - ip); - if (correction > 0) { - start2 += correction; - ref2 += correction; - ml2 -= correction; - } - } else - ml = (int)(start2 - ip); - } - lz4_encodesequence(&ip, &op, &anchor, ml, ref); - - ip = start2; - ref = ref2; - ml = ml2; - - start2 = start3; - ref2 = ref3; - ml2 = ml3; - - goto _search3; - } - - /* Encode Last Literals */ - lastrun = (int)(iend - anchor); - if (lastrun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastrun -= RUN_MASK; - for (; lastrun > 254 ; lastrun -= 255) - *op++ = 255; - *op++ = (u8) lastrun; - } else - *op++ = (lastrun << ML_BITS); - memcpy(op, anchor, iend - anchor); - op += iend - anchor; - /* End */ - return (int) (((char *)op) - dest); -} - -int lz4hc_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem) -{ - int ret = -1; - int out_len = 0; - - struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; - lz4hc_init(hc4, (const u8 *)src); - out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, - (char *)dst, (int)src_len); - - if (out_len < 0) - goto exit; - - *dst_len = out_len; - return 0; - -exit: - return ret; -} -EXPORT_SYMBOL(lz4hc_compress); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("LZ4HC compressor"); diff --git a/linux/sha1.c b/linux/sha1.c deleted file mode 100644 index 5a56dfd7..00000000 --- a/linux/sha1.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * SHA1 routine optimized to do word accesses rather than byte accesses, - * and to avoid unnecessary copies into the context array. - * - * This was based on the git SHA1 implementation. - */ - -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/bitops.h> -#include <linux/cryptohash.h> -#include <asm/unaligned.h> - -/* - * If you have 32 registers or more, the compiler can (and should) - * try to change the array[] accesses into registers. 
However, on - * machines with less than ~25 registers, that won't really work, - * and at least gcc will make an unholy mess of it. - * - * So to avoid that mess which just slows things down, we force - * the stores to memory to actually happen (we might be better off - * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as - * suggested by Artur Skawina - that will also make gcc unable to - * try to do the silly "optimize away loads" part because it won't - * see what the value will be). - * - * Ben Herrenschmidt reports that on PPC, the C version comes close - * to the optimized asm with this (ie on PPC you don't want that - * 'volatile', since there are lots of registers). - * - * On ARM we get the best code generation by forcing a full memory barrier - * between each SHA_ROUND, otherwise gcc happily get wild with spilling and - * the stack frame size simply explode and performance goes down the drain. - */ - -#ifdef CONFIG_X86 - #define setW(x, val) (*(volatile __u32 *)&W(x) = (val)) -#elif defined(CONFIG_ARM) - #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) -#else - #define setW(x, val) (W(x) = (val)) -#endif - -/* This "rolls" over the 512-bit array */ -#define W(x) (array[(x)&15]) - -/* - * Where do we get the source from? The first 16 iterations get it from - * the input data, the next mix it from the 512-bit array. 
- */ -#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t) -#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) - -#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ - __u32 TEMP = input(t); setW(t, TEMP); \ - E += TEMP + rol32(A,5) + (fn) + (constant); \ - B = ror32(B, 2); } while (0) - -#define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) -#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) -#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) -#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) -#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) - -/** - * sha_transform - single block SHA1 transform - * - * @digest: 160 bit digest to update - * @data: 512 bits of data to hash - * @array: 16 words of workspace (see note) - * - * This function generates a SHA1 digest for a single 512-bit block. - * Be warned, it does not handle padding and message digest, do not - * confuse it with the full FIPS 180-1 digest algorithm for variable - * length messages. - * - * Note: If the hash is security sensitive, the caller should be sure - * to clear the workspace. This is left to the caller to avoid - * unnecessary clears between chained hashing operations. 
- */ -void sha_transform(__u32 *digest, const char *data, __u32 *array) -{ - __u32 A, B, C, D, E; - - A = digest[0]; - B = digest[1]; - C = digest[2]; - D = digest[3]; - E = digest[4]; - - /* Round 1 - iterations 0-16 take their input from 'data' */ - T_0_15( 0, A, B, C, D, E); - T_0_15( 1, E, A, B, C, D); - T_0_15( 2, D, E, A, B, C); - T_0_15( 3, C, D, E, A, B); - T_0_15( 4, B, C, D, E, A); - T_0_15( 5, A, B, C, D, E); - T_0_15( 6, E, A, B, C, D); - T_0_15( 7, D, E, A, B, C); - T_0_15( 8, C, D, E, A, B); - T_0_15( 9, B, C, D, E, A); - T_0_15(10, A, B, C, D, E); - T_0_15(11, E, A, B, C, D); - T_0_15(12, D, E, A, B, C); - T_0_15(13, C, D, E, A, B); - T_0_15(14, B, C, D, E, A); - T_0_15(15, A, B, C, D, E); - - /* Round 1 - tail. Input from 512-bit mixing array */ - T_16_19(16, E, A, B, C, D); - T_16_19(17, D, E, A, B, C); - T_16_19(18, C, D, E, A, B); - T_16_19(19, B, C, D, E, A); - - /* Round 2 */ - T_20_39(20, A, B, C, D, E); - T_20_39(21, E, A, B, C, D); - T_20_39(22, D, E, A, B, C); - T_20_39(23, C, D, E, A, B); - T_20_39(24, B, C, D, E, A); - T_20_39(25, A, B, C, D, E); - T_20_39(26, E, A, B, C, D); - T_20_39(27, D, E, A, B, C); - T_20_39(28, C, D, E, A, B); - T_20_39(29, B, C, D, E, A); - T_20_39(30, A, B, C, D, E); - T_20_39(31, E, A, B, C, D); - T_20_39(32, D, E, A, B, C); - T_20_39(33, C, D, E, A, B); - T_20_39(34, B, C, D, E, A); - T_20_39(35, A, B, C, D, E); - T_20_39(36, E, A, B, C, D); - T_20_39(37, D, E, A, B, C); - T_20_39(38, C, D, E, A, B); - T_20_39(39, B, C, D, E, A); - - /* Round 3 */ - T_40_59(40, A, B, C, D, E); - T_40_59(41, E, A, B, C, D); - T_40_59(42, D, E, A, B, C); - T_40_59(43, C, D, E, A, B); - T_40_59(44, B, C, D, E, A); - T_40_59(45, A, B, C, D, E); - T_40_59(46, E, A, B, C, D); - T_40_59(47, D, E, A, B, C); - T_40_59(48, C, D, E, A, B); - T_40_59(49, B, C, D, E, A); - T_40_59(50, A, B, C, D, E); - T_40_59(51, E, A, B, C, D); - T_40_59(52, D, E, A, B, C); - T_40_59(53, C, D, E, A, B); - T_40_59(54, B, C, D, E, A); - T_40_59(55, A, B, 
C, D, E); - T_40_59(56, E, A, B, C, D); - T_40_59(57, D, E, A, B, C); - T_40_59(58, C, D, E, A, B); - T_40_59(59, B, C, D, E, A); - - /* Round 4 */ - T_60_79(60, A, B, C, D, E); - T_60_79(61, E, A, B, C, D); - T_60_79(62, D, E, A, B, C); - T_60_79(63, C, D, E, A, B); - T_60_79(64, B, C, D, E, A); - T_60_79(65, A, B, C, D, E); - T_60_79(66, E, A, B, C, D); - T_60_79(67, D, E, A, B, C); - T_60_79(68, C, D, E, A, B); - T_60_79(69, B, C, D, E, A); - T_60_79(70, A, B, C, D, E); - T_60_79(71, E, A, B, C, D); - T_60_79(72, D, E, A, B, C); - T_60_79(73, C, D, E, A, B); - T_60_79(74, B, C, D, E, A); - T_60_79(75, A, B, C, D, E); - T_60_79(76, E, A, B, C, D); - T_60_79(77, D, E, A, B, C); - T_60_79(78, C, D, E, A, B); - T_60_79(79, B, C, D, E, A); - - digest[0] += A; - digest[1] += B; - digest[2] += C; - digest[3] += D; - digest[4] += E; -} -EXPORT_SYMBOL(sha_transform); - -/** - * sha_init - initialize the vectors for a SHA1 digest - * @buf: vector to initialize - */ -void sha_init(__u32 *buf) -{ - buf[0] = 0x67452301; - buf[1] = 0xefcdab89; - buf[2] = 0x98badcfe; - buf[3] = 0x10325476; - buf[4] = 0xc3d2e1f0; -} -EXPORT_SYMBOL(sha_init); |