author    Kent Overstreet <kent.overstreet@gmail.com>  2018-02-16 15:36:33 -0500
committer Kent Overstreet <kent.overstreet@gmail.com>  2018-02-16 18:13:24 -0500
commit    807abf36c1e119825d42cda6f6b249649ca44eb5 (patch)
tree      59228cffd1e9b60d822e7381b897df8036788ffe
parent    8d95f4a437a2a4a5f8a18be00d8d74dcc4767f51 (diff)
Update bcachefs sources to e99d29e402 bcachefs: zstd support, compression refactoring
-rw-r--r--  .bcachefs_revision                   |   2
-rw-r--r--  Makefile                             |  11
-rw-r--r--  cmd_migrate.c                        |   4
-rw-r--r--  debian/control                       |   2
-rw-r--r--  libbcachefs/alloc.c                  |  67
-rw-r--r--  libbcachefs/bcachefs.h               |  13
-rw-r--r--  libbcachefs/bcachefs_format.h        |  25
-rw-r--r--  libbcachefs/btree_cache.c            |  29
-rw-r--r--  libbcachefs/btree_gc.c               |   5
-rw-r--r--  libbcachefs/btree_io.c               |   5
-rw-r--r--  libbcachefs/btree_update_interior.c  |  10
-rw-r--r--  libbcachefs/btree_update_leaf.c      |  13
-rw-r--r--  libbcachefs/checksum.c               |  44
-rw-r--r--  libbcachefs/checksum.h               |  19
-rw-r--r--  libbcachefs/compress.c               | 396
-rw-r--r--  libbcachefs/extents.c                |  11
-rw-r--r--  libbcachefs/extents.h                |  37
-rw-r--r--  libbcachefs/fs-io.c                  |  19
-rw-r--r--  libbcachefs/io.c                     |  45
-rw-r--r--  libbcachefs/io.h                     |   2
-rw-r--r--  libbcachefs/journal.c                |  24
-rw-r--r--  libbcachefs/keylist.h                |  11
-rw-r--r--  libbcachefs/migrate.c                | 125
-rw-r--r--  libbcachefs/migrate.h                |   1
-rw-r--r--  libbcachefs/move.c                   | 276
-rw-r--r--  libbcachefs/move.h                   |  34
-rw-r--r--  libbcachefs/movinggc.c               |  22
-rw-r--r--  libbcachefs/opts.c                   |   1
-rw-r--r--  libbcachefs/opts.h                   |   7
-rw-r--r--  libbcachefs/quota.c                  |  12
-rw-r--r--  libbcachefs/quota.h                  |   7
-rw-r--r--  libbcachefs/siphash.c                |   1
-rw-r--r--  libbcachefs/super-io.c               | 110
-rw-r--r--  libbcachefs/super-io.h               |  12
-rw-r--r--  libbcachefs/super.c                  |  75
-rw-r--r--  libbcachefs/super.h                  |   5
-rw-r--r--  libbcachefs/super_types.h            |   2
-rw-r--r--  libbcachefs/sysfs.c                  |   9
-rw-r--r--  libbcachefs/tier.c                   |  22
-rw-r--r--  libbcachefs/util.h                   |  15
40 files changed, 925 insertions(+), 605 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 274236e..76acdf9 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-d5e561b3cc023dd247d2b3d08b680709ec21b477
+e99d29e40210f6d9b7ec9e5b7aee1e48ae7655c5
diff --git a/Makefile b/Makefile
index ef1eacf..af7a206 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \
-D_GNU_SOURCE \
-D_LGPL_SOURCE \
-DRCU_MEMBARRIER \
+ -DZSTD_STATIC_LINKING_ONLY \
-DNO_BCACHEFS_CHARDEV \
-DNO_BCACHEFS_FS \
-DNO_BCACHEFS_SYSFS \
@@ -31,9 +32,15 @@ ifdef D
endif
PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib"
+PKGCONFIG_LIBS_STATIC="libzstd"
+
CFLAGS+=`pkg-config --cflags ${PKGCONFIG_LIBS}`
-LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}` \
- -lm -lpthread -lrt -lscrypt -lkeyutils -laio
+LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}`
+
+CFLAGS+=`pkg-config --static --cflags ${PKGCONFIG_LIBS_STATIC}`
+LDLIBS+=`pkg-config --static --libs ${PKGCONFIG_LIBS_STATIC}`
+
+LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio
ifeq ($(PREFIX),/usr)
ROOT_SBINDIR=/sbin
diff --git a/cmd_migrate.c b/cmd_migrate.c
index d676bb5..4ba3538 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -344,8 +344,8 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
die("error reserving space in new filesystem: %s",
strerror(-ret));
- bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_bkey_devs(extent_i_to_s_c(e).s_c));
+ bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+ extent_i_to_s_c(e).s_c);
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
&res, NULL, NULL, 0);
diff --git a/debian/control b/debian/control
index 07f2f2f..08673f4 100644
--- a/debian/control
+++ b/debian/control
@@ -5,7 +5,7 @@ Priority: optional
Standards-Version: 3.9.5
Build-Depends: debhelper (>= 9), pkg-config, libblkid-dev, uuid-dev,
libscrypt-dev, libsodium-dev, libkeyutils-dev, liburcu-dev, zlib1g-dev,
- libattr1-dev, libaio-dev
+ libattr1-dev, libaio-dev, libzstd-dev
Homepage: http://bcache.evilpiepirate.org/
Package: bcachefs-tools
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index c195ffb..339ffd0 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -1201,43 +1201,56 @@ out:
return ob - c->open_buckets;
}
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
- struct write_point *wp,
- struct bch_devs_mask *devs)
+static int __dev_alloc_cmp(struct bch_fs *c,
+ struct write_point *wp,
+ unsigned l, unsigned r)
{
- struct dev_alloc_list ret = { .nr = 0 };
- struct bch_dev *ca, *ca2;
- unsigned i, j;
+ struct bch_dev *ca_l = rcu_dereference(c->devs[l]);
+ struct bch_dev *ca_r = rcu_dereference(c->devs[r]);
- for_each_member_device_rcu(ca, c, i, devs) {
- for (j = 0; j < ret.nr; j++) {
- unsigned idx = ret.devs[j];
+ if (ca_l && ca_r && ca_l->mi.tier != ca_r->mi.tier)
+ return ((ca_l->mi.tier > ca_r->mi.tier) -
+ (ca_l->mi.tier < ca_r->mi.tier));
- ca2 = rcu_dereference(c->devs[idx]);
- if (!ca2)
- break;
+ return ((wp->next_alloc[l] > wp->next_alloc[r]) -
+ (wp->next_alloc[l] < wp->next_alloc[r]));
+}
- if (ca->mi.tier < ca2->mi.tier)
- break;
+#define dev_alloc_cmp(l, r) __dev_alloc_cmp(c, wp, l, r)
- if (ca->mi.tier == ca2->mi.tier &&
- wp->next_alloc[i] < wp->next_alloc[idx])
- break;
- }
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
+ struct write_point *wp,
+ struct bch_devs_mask *devs)
+{
+ struct dev_alloc_list ret = { .nr = 0 };
+ struct bch_dev *ca;
+ unsigned i;
- array_insert_item(ret.devs, ret.nr, j, i);
- }
+ for_each_member_device_rcu(ca, c, i, devs)
+ ret.devs[ret.nr++] = i;
+ bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
return ret;
}
void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
- unsigned i;
+ u64 *v = wp->next_alloc + ca->dev_idx;
+ u64 free_space = dev_buckets_free(c, ca);
+ u64 free_space_inv = free_space
+ ? div64_u64(1ULL << 48, free_space)
+ : 1ULL << 48;
+ u64 scale = *v / 4;
+
+ if (*v + free_space_inv >= *v)
+ *v += free_space_inv;
+ else
+ *v = U64_MAX;
- for (i = 0; i < ARRAY_SIZE(wp->next_alloc); i++)
- wp->next_alloc[i] >>= 1;
+ for (v = wp->next_alloc;
+ v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
}
static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
@@ -1249,7 +1262,6 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
{
enum bucket_alloc_ret ret = NO_DEVICES;
struct dev_alloc_list devs_sorted;
- u64 buckets_free;
unsigned i;
BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
@@ -1281,13 +1293,6 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
- buckets_free = U64_MAX, dev_buckets_free(c, ca);
- if (buckets_free)
- wp->next_alloc[ca->dev_idx] +=
- div64_u64(U64_MAX, buckets_free *
- ca->mi.bucket_size);
- else
- wp->next_alloc[ca->dev_idx] = U64_MAX;
bch2_wp_rescale(c, ca, wp);
__clear_bit(ca->dev_idx, devs->d);
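The new allocation code above is effectively a stride scheduler: each successful allocation adds 2^48 / free_buckets to the winning device's next_alloc key (saturating at U64_MAX), every key then decays by a quarter of the winner's previous key, and bch2_wp_alloc_list() orders candidates by ascending (tier, next_alloc), so devices with more free space accumulate stride more slowly and win more often. A minimal userspace simulation of that behaviour (a sketch with made-up free-space numbers and a single tier, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define NR_DEVS 3

int main(void)
{
	uint64_t next_alloc[NR_DEVS] = { 0 };
	const uint64_t free_buckets[NR_DEVS] = { 1000, 500, 250 };
	unsigned picks[NR_DEVS] = { 0 };

	for (int n = 0; n < 7000; n++) {
		/* __dev_alloc_cmp(): lowest next_alloc wins (single tier) */
		unsigned i, best = 0;

		for (i = 1; i < NR_DEVS; i++)
			if (next_alloc[i] < next_alloc[best])
				best = i;
		picks[best]++;

		/* bch2_wp_rescale(): bump the winner, decay everyone */
		uint64_t scale = next_alloc[best] / 4;
		uint64_t inv = free_buckets[best]
			? (1ULL << 48) / free_buckets[best]
			: 1ULL << 48;

		next_alloc[best] = next_alloc[best] + inv >= next_alloc[best]
			? next_alloc[best] + inv
			: UINT64_MAX;

		for (i = 0; i < NR_DEVS; i++)
			next_alloc[i] -= next_alloc[i] < scale
				? next_alloc[i] : scale;
	}

	for (unsigned i = 0; i < NR_DEVS; i++)
		printf("dev %u: %llu free buckets, %u picks\n", i,
		       (unsigned long long) free_buckets[i], picks[i]);
	return 0;
}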
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index cb9906c..5a3e99b 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -194,6 +194,7 @@
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/zstd.h>
#include "bcachefs_format.h"
#include "bset.h"
@@ -231,6 +232,12 @@ do { \
bch_info(c, fmt, ##__VA_ARGS__); \
} while (0)
+#define pr_verbose_init(opts, fmt, ...) \
+do { \
+ if (opt_get(opts, verbose_init)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+} while (0)
+
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
@@ -646,10 +653,10 @@ struct bch_fs {
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
- mempool_t lz4_workspace_pool;
- void *zlib_workspace;
- struct mutex zlib_workspace_lock;
mempool_t compression_bounce[2];
+ mempool_t compress_workspace[BCH_COMPRESSION_NR];
+ mempool_t decompress_workspace;
+ ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
struct crypto_skcipher *chacha20;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 854e1c3..5e40627 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -6,7 +6,6 @@
*/
#include <asm/types.h>
-#include <linux/compiler.h>
#include <asm/byteorder.h>
#include <linux/uuid.h>
@@ -370,7 +369,8 @@ enum bch_compression_type {
BCH_COMPRESSION_LZ4_OLD = 1,
BCH_COMPRESSION_GZIP = 2,
BCH_COMPRESSION_LZ4 = 3,
- BCH_COMPRESSION_NR = 4,
+ BCH_COMPRESSION_ZSTD = 4,
+ BCH_COMPRESSION_NR = 5,
};
enum bch_extent_entry_type {
@@ -1082,6 +1082,7 @@ LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
BCH_FEATURE_GZIP = 1,
+ BCH_FEATURE_ZSTD = 2,
};
/* options: */
@@ -1109,11 +1110,17 @@ enum bch_str_hash_opts {
BCH_STR_HASH_NR = 3,
};
+#define BCH_COMPRESSION_TYPES() \
+ x(NONE) \
+ x(LZ4) \
+ x(GZIP) \
+ x(ZSTD)
+
enum bch_compression_opts {
- BCH_COMPRESSION_OPT_NONE = 0,
- BCH_COMPRESSION_OPT_LZ4 = 1,
- BCH_COMPRESSION_OPT_GZIP = 2,
- BCH_COMPRESSION_OPT_NR = 3,
+#define x(t) BCH_COMPRESSION_OPT_##t,
+ BCH_COMPRESSION_TYPES()
+#undef x
+ BCH_COMPRESSION_OPT_NR
};
/*
@@ -1322,8 +1329,10 @@ struct btree_node {
};
} __attribute__((packed, aligned(8)));
-LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
-LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
struct btree_node_entry {
struct bch_csum csum;
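BCH_COMPRESSION_TYPES() above is an x-macro: the same name list is expanded once here with x() emitting enum constants, and again in checksum.h with x() emitting table entries, so adding ZSTD updates both definitions from one place. Expanded by hand, the two uses in this commit come out to (preprocessor output shown for illustration, not code in the tree):

/* enum bch_compression_opts, after expansion: */
enum bch_compression_opts {
	BCH_COMPRESSION_OPT_NONE,
	BCH_COMPRESSION_OPT_LZ4,
	BCH_COMPRESSION_OPT_GZIP,
	BCH_COMPRESSION_OPT_ZSTD,
	BCH_COMPRESSION_OPT_NR
};

/* bch2_compression_opt_to_type[] in checksum.h, after expansion: */
static const unsigned bch2_compression_opt_to_type[] = {
	[BCH_COMPRESSION_OPT_NONE] = BCH_COMPRESSION_NONE,
	[BCH_COMPRESSION_OPT_LZ4]  = BCH_COMPRESSION_LZ4,
	[BCH_COMPRESSION_OPT_GZIP] = BCH_COMPRESSION_GZIP,
	[BCH_COMPRESSION_OPT_ZSTD] = BCH_COMPRESSION_ZSTD,
};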
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 0bde449..7eae4d2 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -373,19 +373,23 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
unsigned i;
- int ret;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
if (ret)
- return ret;
+ goto out;
bc->table_init_done = true;
bch2_recalc_btree_reserve(c);
for (i = 0; i < bc->reserve; i++)
- if (!btree_node_mem_alloc(c, GFP_KERNEL))
- return -ENOMEM;
+ if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
list_splice_init(&bc->live, &bc->freeable);
@@ -393,12 +397,16 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
mutex_init(&c->verify_lock);
c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
- if (!c->verify_ondisk)
- return -ENOMEM;
+ if (!c->verify_ondisk) {
+ ret = -ENOMEM;
+ goto out;
+ }
c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
- if (!c->verify_data)
- return -ENOMEM;
+ if (!c->verify_data) {
+ ret = -ENOMEM;
+ goto out;
+ }
list_del_init(&c->verify_data->list);
#endif
@@ -408,8 +416,9 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bc->shrink.seeks = 4;
bc->shrink.batch = btree_pages(c) * 2;
register_shrinker(&bc->shrink);
-
- return 0;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 6350866..f2e9c10 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -148,14 +148,13 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
{
enum bch_data_type data_type = type == BKEY_TYPE_BTREE
? BCH_DATA_BTREE : BCH_DATA_USER;
- struct bch_devs_list devs = bch2_bkey_devs(k);
int ret = 0;
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c,
+ fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
"superblock not marked as containing replicas (type %u)",
data_type)) {
- ret = bch2_check_mark_super(c, data_type, devs);
+ ret = bch2_mark_bkey_replicas(c, data_type, k);
if (ret)
return ret;
}
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 9b4eff1..d805fb4 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1135,6 +1135,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
+ bool first = !b->written;
if (!b->written) {
i = &b->data->keys;
@@ -1194,10 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
}
if (ret) {
- btree_err_on(!b->written,
+ btree_err_on(first,
BTREE_ERR_FIXABLE, c, b, i,
"first btree node bset has blacklisted journal seq");
- if (b->written)
+ if (!first)
continue;
}
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index c45527a..0e0156d 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -430,6 +430,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
n->data->min_key = b->data->min_key;
n->data->max_key = b->data->max_key;
n->data->format = format;
+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
btree_node_set_format(n, format);
@@ -559,8 +560,8 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
goto err_free;
}
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ bkey_i_to_s_c(&b->key));
if (ret)
goto err_free;
@@ -1225,6 +1226,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
+ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
n2->key.k.p = n1->key.k.p;
btree_node_set_format(n2, n2->data->format);
@@ -2019,8 +2021,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
goto err;
}
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_extent_devs(extent_i_to_s_c(new_key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ extent_i_to_s_c(new_key).s_c);
if (ret)
goto err_free_update;
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 4b252b6..007aa5e 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -272,15 +272,10 @@ static void multi_unlock_write(struct btree_insert *trans)
bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
}
-static inline void btree_trans_sort(struct btree_insert *trans)
+static inline int btree_trans_cmp(struct btree_insert_entry l,
+ struct btree_insert_entry r)
{
- int i, end = trans->nr;
-
- while (--end > 0)
- for (i = 0; i < end; i++)
- if (btree_iter_cmp(trans->entries[i].iter,
- trans->entries[i + 1].iter) > 0)
- swap(trans->entries[i], trans->entries[i + 1]);
+ return btree_iter_cmp(l.iter, r.iter);
}
/* Normal update interface: */
@@ -313,7 +308,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
bkey_i_to_s_c(i->k)));
}
- btree_trans_sort(trans);
+ bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
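Both this file and alloc.c now call a generic bubble_sort() helper in place of open-coded sorts; the helper itself is added to util.h, whose hunk isn't shown in this diff. A minimal kernel-style sketch of such a macro, assuming it takes an array, an element count, and a comparison taken by value (as btree_trans_cmp() and dev_alloc_cmp() are), and that a swap() macro is available:

#define bubble_sort(_base, _nr, _cmp)					\
do {									\
	ssize_t __i, __end;						\
	bool __swapped = true;						\
									\
	for (__end = (ssize_t) (_nr) - 1; __end > 0 && __swapped; __end--) {\
		__swapped = false;					\
		for (__i = 0; __i < __end; __i++)			\
			if (_cmp((_base)[__i], (_base)[__i + 1]) > 0) {	\
				swap((_base)[__i], (_base)[__i + 1]);	\
				__swapped = true;			\
			}						\
	}								\
} while (0)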
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 0875585..56bd99f 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -219,12 +219,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
crypto_alloc_skcipher("chacha20", 0, 0);
int ret;
- if (!chacha20)
+ if (!chacha20) {
+ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
return PTR_ERR(chacha20);
+ }
ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
- if (ret)
+ if (ret) {
+ pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
+ }
do_encrypt(chacha20, nonce, buf, len);
err:
@@ -567,7 +571,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
ret = bch2_request_key(c->disk_sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key");
+ bch_err(c, "error requesting encryption key: %i", ret);
goto err;
}
@@ -594,13 +598,19 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
- if (IS_ERR(c->chacha20))
+ if (IS_ERR(c->chacha20)) {
+ bch_err(c, "error requesting chacha20 module: %li",
+ PTR_ERR(c->chacha20));
return PTR_ERR(c->chacha20);
+ }
if (!c->poly1305)
c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- if (IS_ERR(c->poly1305))
+ if (IS_ERR(c->poly1305)) {
+ bch_err(c, "error requesting poly1305 module: %li",
+ PTR_ERR(c->poly1305));
return PTR_ERR(c->poly1305);
+ }
return 0;
}
@@ -660,7 +670,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
if (keyed) {
ret = bch2_request_key(c->disk_sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key");
+ bch_err(c, "error requesting encryption key: %i", ret);
goto err;
}
@@ -707,27 +717,35 @@ int bch2_fs_encryption_init(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
struct bch_key key;
- int ret;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(c->sha256))
- return PTR_ERR(c->sha256);
+ if (IS_ERR(c->sha256)) {
+ bch_err(c, "error requesting sha256 module");
+ ret = PTR_ERR(c->sha256);
+ goto out;
+ }
crypt = bch2_sb_get_crypt(c->disk_sb);
if (!crypt)
- return 0;
+ goto out;
ret = bch2_alloc_ciphers(c);
if (ret)
- return ret;
+ goto out;
ret = bch2_decrypt_sb_key(c, crypt, &key);
if (ret)
- goto err;
+ goto out;
ret = crypto_skcipher_setkey(c->chacha20,
(void *) &key.key, sizeof(key.key));
-err:
+ if (ret)
+ goto out;
+out:
memzero_explicit(&key, sizeof(key));
+ pr_verbose_init(c->opts, "ret %i", ret);
return ret;
}
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index b0c8a50..7862294 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -91,20 +91,11 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
-static inline enum bch_compression_type
-bch2_compression_opt_to_type(enum bch_compression_opts type)
-{
- switch (type) {
- case BCH_COMPRESSION_OPT_NONE:
- return BCH_COMPRESSION_NONE;
- case BCH_COMPRESSION_OPT_LZ4:
- return BCH_COMPRESSION_LZ4;
- case BCH_COMPRESSION_OPT_GZIP:
- return BCH_COMPRESSION_GZIP;
- default:
- BUG();
- }
-}
+static const unsigned bch2_compression_opt_to_type[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
+ BCH_COMPRESSION_TYPES()
+#undef x
+};
static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
unsigned type)
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 6407998..7726cfd 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -8,6 +8,7 @@
#include "lz4.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
+#include <linux/zstd.h>
/* Bounce buffer: */
struct bbuf {
@@ -151,6 +152,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
struct bbuf src_data = { NULL };
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc.uncompressed_size << 9;
+ void *workspace;
int ret;
src_data = bio_map_or_bounce(c, src, READ);
@@ -159,57 +161,64 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
case BCH_COMPRESSION_LZ4_OLD:
ret = bch2_lz4_decompress(src_data.b, &src_len,
dst_data, dst_len);
- if (ret) {
- ret = -EIO;
+ if (ret)
goto err;
- }
break;
case BCH_COMPRESSION_LZ4:
ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
src_len, dst_len, dst_len);
- if (ret != dst_len) {
- ret = -EIO;
+ if (ret != dst_len)
goto err;
- }
break;
case BCH_COMPRESSION_GZIP: {
- void *workspace;
- z_stream strm;
-
- workspace = kmalloc(zlib_inflate_workspacesize(),
- GFP_NOIO|__GFP_NOWARN);
- if (!workspace) {
- mutex_lock(&c->zlib_workspace_lock);
- workspace = c->zlib_workspace;
- }
+ z_stream strm = {
+ .next_in = src_data.b,
+ .avail_in = src_len,
+ .next_out = dst_data,
+ .avail_out = dst_len,
+ };
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
- strm.next_in = src_data.b;
- strm.avail_in = src_len;
- strm.next_out = dst_data;
- strm.avail_out = dst_len;
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
-
ret = zlib_inflate(&strm, Z_FINISH);
- if (workspace == c->zlib_workspace)
- mutex_unlock(&c->zlib_workspace_lock);
- else
- kfree(workspace);
+ mempool_free(workspace, &c->decompress_workspace);
- if (ret != Z_STREAM_END) {
- ret = -EIO;
+ if (ret != Z_STREAM_END)
+ goto err;
+ break;
+ }
+ case BCH_COMPRESSION_ZSTD: {
+ ZSTD_DCtx *ctx;
+ size_t len;
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+
+ src_len = le32_to_cpup(src_data.b);
+
+ len = ZSTD_decompressDCtx(ctx,
+ dst_data, dst_len,
+ src_data.b + 4, src_len);
+
+ mempool_free(workspace, &c->decompress_workspace);
+
+ if (len != dst_len)
goto err;
- }
break;
}
default:
BUG();
}
ret = 0;
-err:
+out:
bio_unmap_or_unbounce(c, src_data);
return ret;
+err:
+ ret = -EIO;
+ goto out;
}
int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
@@ -282,113 +291,129 @@ err:
return ret;
}
+static int attempt_compress(struct bch_fs *c,
+ void *workspace,
+ void *dst, size_t dst_len,
+ void *src, size_t src_len,
+ unsigned compression_type)
+{
+ switch (compression_type) {
+ case BCH_COMPRESSION_LZ4: {
+ int len = src_len;
+ int ret = LZ4_compress_destSize(
+ src, dst,
+ &len, dst_len,
+ workspace);
+
+ if (len < src_len)
+ return -len;
+
+ return ret;
+ }
+ case BCH_COMPRESSION_GZIP: {
+ z_stream strm = {
+ .next_in = src,
+ .avail_in = src_len,
+ .next_out = dst,
+ .avail_out = dst_len,
+ };
+
+ zlib_set_workspace(&strm, workspace);
+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
+
+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+ return 0;
+
+ if (zlib_deflateEnd(&strm) != Z_OK)
+ return 0;
+
+ return strm.total_out;
+ }
+ case BCH_COMPRESSION_ZSTD: {
+ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
+ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+
+ size_t len = ZSTD_compressCCtx(ctx,
+ dst + 4, dst_len - 4,
+ src, src_len,
+ c->zstd_params);
+ if (ZSTD_isError(len))
+ return 0;
+
+ *((__le32 *) dst) = cpu_to_le32(len);
+ return len + 4;
+ }
+ default:
+ BUG();
+ }
+}
+
static unsigned __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned compression_type)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
+ void *workspace;
unsigned pad;
int ret = 0;
/* If it's only one block, don't bother trying to compress: */
if (bio_sectors(src) <= c->opts.block_size)
- goto err;
+ return 0;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
- switch (compression_type) {
- case BCH_COMPRESSION_LZ4_OLD:
- compression_type = BCH_COMPRESSION_LZ4;
+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
- case BCH_COMPRESSION_LZ4: {
- void *workspace;
- int len = src->bi_iter.bi_size;
-
- workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-
- while (1) {
- if (len <= block_bytes(c)) {
- ret = 0;
- break;
- }
-
- ret = LZ4_compress_destSize(
- src_data.b, dst_data.b,
- &len, dst->bi_iter.bi_size,
- workspace);
- if (ret >= len) {
- /* uncompressible: */
- ret = 0;
- break;
- }
-
- if (!(len & (block_bytes(c) - 1)))
- break;
- len = round_down(len, block_bytes(c));
- }
- mempool_free(workspace, &c->lz4_workspace_pool);
+ *src_len = src->bi_iter.bi_size;
+ *dst_len = dst->bi_iter.bi_size;
- if (!ret)
- goto err;
-
- *src_len = len;
- *dst_len = ret;
- ret = 0;
- break;
- }
- case BCH_COMPRESSION_GZIP: {
- void *workspace;
- z_stream strm;
-
- workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
- DEF_MEM_LEVEL),
- GFP_NOIO|__GFP_NOWARN);
- if (!workspace) {
- mutex_lock(&c->zlib_workspace_lock);
- workspace = c->zlib_workspace;
+ /*
+ * XXX: this algorithm sucks when the compression code doesn't tell us
+ * how much would fit, like LZ4 does:
+ */
+ while (1) {
+ if (*src_len <= block_bytes(c)) {
+ ret = -1;
+ break;
}
- strm.next_in = src_data.b;
- strm.avail_in = min(src->bi_iter.bi_size,
- dst->bi_iter.bi_size);
- strm.next_out = dst_data.b;
- strm.avail_out = dst->bi_iter.bi_size;
- zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
- Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
-
- ret = zlib_deflate(&strm, Z_FINISH);
- if (ret != Z_STREAM_END) {
- ret = -EIO;
- goto zlib_err;
+ ret = attempt_compress(c, workspace,
+ dst_data.b, *dst_len,
+ src_data.b, *src_len,
+ compression_type);
+ if (ret > 0) {
+ *dst_len = ret;
+ ret = 0;
+ break;
}
- ret = zlib_deflateEnd(&strm);
- if (ret != Z_OK) {
- ret = -EIO;
- goto zlib_err;
+ /* Didn't fit: should we retry with a smaller amount? */
+ if (*src_len <= *dst_len) {
+ ret = -1;
+ break;
}
- ret = 0;
-zlib_err:
- if (workspace == c->zlib_workspace)
- mutex_unlock(&c->zlib_workspace_lock);
+ /*
+ * If ret is negative, it's a hint as to how much data would fit
+ */
+ BUG_ON(-ret >= *src_len);
+
+ if (ret < 0)
+ *src_len = -ret;
else
- kfree(workspace);
+ *src_len -= (*src_len - *dst_len) / 2;
+ *src_len = round_down(*src_len, block_bytes(c));
+ }
- if (ret)
- goto err;
+ mempool_free(workspace, &c->compress_workspace[compression_type]);
- *dst_len = strm.total_out;
- *src_len = strm.total_in;
- break;
- }
- default:
- BUG();
- }
+ if (ret)
+ goto err;
/* Didn't get smaller: */
if (round_up(*dst_len, block_bytes(c)) >= *src_len)
@@ -429,6 +454,9 @@ unsigned bch2_bio_compress(struct bch_fs *c,
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ if (compression_type == BCH_COMPRESSION_LZ4_OLD)
+ compression_type = BCH_COMPRESSION_LZ4;
+
compression_type =
__bio_compress(c, dst, dst_len, src, src_len, compression_type);
@@ -437,81 +465,147 @@ unsigned bch2_bio_compress(struct bch_fs *c,
return compression_type;
}
+#define BCH_FEATURE_NONE 0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+ BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+#undef BCH_FEATURE_NONE
+
/* doesn't write superblock: */
int bch2_check_set_has_compressed_data(struct bch_fs *c,
unsigned compression_type)
{
- switch (compression_type) {
- case BCH_COMPRESSION_OPT_NONE:
- return 0;
- case BCH_COMPRESSION_OPT_LZ4:
- if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
- return 0;
+ unsigned f;
+ int ret = 0;
- bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
- break;
- case BCH_COMPRESSION_OPT_GZIP:
- if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
+ pr_verbose_init(c->opts, "");
- bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
- break;
- default:
- BUG();
- }
+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+ if (!compression_type)
+ goto out;
- return bch2_fs_compress_init(c);
+ f = bch2_compression_opt_to_feature[compression_type];
+ if (bch2_sb_test_feature(c->disk_sb, f))
+ goto out;
+
+ bch2_sb_set_feature(c->disk_sb, f);
+ ret = bch2_fs_compress_init(c);
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
void bch2_fs_compress_exit(struct bch_fs *c)
{
- vfree(c->zlib_workspace);
- mempool_exit(&c->lz4_workspace_pool);
+ unsigned i;
+
+ mempool_exit(&c->decompress_workspace);
+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+ mempool_exit(&c->compress_workspace[i]);
mempool_exit(&c->compression_bounce[WRITE]);
mempool_exit(&c->compression_bounce[READ]);
}
-#define COMPRESSION_WORKSPACE_SIZE \
- max_t(size_t, zlib_inflate_workspacesize(), \
- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
+static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ return kvpmalloc(size, gfp_mask);
+}
+
+void mempool_kvpfree(void *element, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ kvpfree(element, size);
+}
+
+static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ return !mempool_initialized(pool)
+ ? mempool_init(pool, min_nr, mempool_kvpmalloc,
+ mempool_kvpfree, (void *) size)
+ : 0;
+}
int bch2_fs_compress_init(struct bch_fs *c)
{
- unsigned order = get_order(c->sb.encoded_extent_max << 9);
- int ret;
+ size_t max_extent = c->sb.encoded_extent_max << 9;
+ size_t order = get_order(max_extent);
+ size_t decompress_workspace_size = 0;
+ bool decompress_workspace_needed;
+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+ struct {
+ unsigned feature;
+ unsigned type;
+ size_t compress_workspace;
+ size_t decompress_workspace;
+ } compression_types[] = {
+ { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
+ { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+ zlib_inflate_workspacesize(), },
+ { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
+ ZSTD_CCtxWorkspaceBound(params.cParams),
+ ZSTD_DCtxWorkspaceBound() },
+ }, *i;
+ int ret = 0;
- if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
- !bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
+ pr_verbose_init(c->opts, "");
+
+ c->zstd_params = params;
+
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++)
+ if (bch2_sb_test_feature(c->disk_sb, i->feature))
+ goto have_compressed;
+
+ goto out;
+have_compressed:
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_page_pool(&c->compression_bounce[READ],
1, order);
if (ret)
- return ret;
+ goto out;
}
if (!mempool_initialized(&c->compression_bounce[WRITE])) {
ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
1, order);
if (ret)
- return ret;
+ goto out;
}
- if (!mempool_initialized(&c->lz4_workspace_pool) &&
- bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
- ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
- 1, LZ4_MEM_COMPRESS);
- if (ret)
- return ret;
- }
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++) {
+ decompress_workspace_size =
+ max(decompress_workspace_size, i->decompress_workspace);
- if (!c->zlib_workspace &&
- bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
- c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
- if (!c->zlib_workspace)
- return -ENOMEM;
+ if (!bch2_sb_test_feature(c->disk_sb, i->feature))
+ continue;
+
+ if (i->decompress_workspace)
+ decompress_workspace_needed = true;
+
+ ret = mempool_init_kvpmalloc_pool(
+ &c->compress_workspace[i->type],
+ 1, i->compress_workspace);
+ if (ret)
+ goto out;
}
- return 0;
+ ret = mempool_init_kmalloc_pool(
+ &c->decompress_workspace,
+ 1, decompress_workspace_size);
+ if (ret)
+ goto out;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
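The zstd path above frames each extent with a 4-byte little-endian length header (cpu_to_le32() on write, le32_to_cpup() on read): extents get padded out to the block size, so ZSTD_decompressDCtx() must be handed the exact compressed size recovered from the header. A userspace sketch of the same framing convention, using the ordinary libzstd API rather than the kernel's workspace-based one (build with -lzstd):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zstd.h>

/* explicit little-endian header, mirroring cpu_to_le32()/le32_to_cpup() */
static void put_le32(void *p, uint32_t v)
{
	uint8_t *b = p;
	b[0] = v; b[1] = v >> 8; b[2] = v >> 16; b[3] = v >> 24;
}

static uint32_t get_le32(const void *p)
{
	const uint8_t *b = p;
	return b[0] | ((uint32_t) b[1] << 8) |
	       ((uint32_t) b[2] << 16) | ((uint32_t) b[3] << 24);
}

static size_t frame_compress(void *dst, size_t dst_len,
			     const void *src, size_t src_len)
{
	size_t len = ZSTD_compress((char *) dst + 4, dst_len - 4,
				   src, src_len, 1 /* compression level */);
	if (ZSTD_isError(len))
		return 0;

	put_le32(dst, len);
	return len + 4;	/* header + payload, as in attempt_compress() */
}

static size_t frame_decompress(void *dst, size_t dst_len, const void *src)
{
	/* recover the exact compressed size from the header */
	return ZSTD_decompress(dst, dst_len,
			       (const char *) src + 4, get_le32(src));
}

int main(void)
{
	char src[4096], comp[4096 + 4], out[4096];
	memset(src, 'a', sizeof(src));

	size_t clen = frame_compress(comp, sizeof(comp), src, sizeof(src));
	size_t dlen = frame_decompress(out, sizeof(out), comp);

	printf("4096 -> %zu bytes framed; roundtrip %s\n", clen,
	       dlen == sizeof(src) && !memcmp(src, out, sizeof(src))
	       ? "ok" : "BAD");
	return 0;
}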
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index f5dccfa..ce1f8ba 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -694,7 +694,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
goto err;
}
- if (!bch2_sb_has_replicas(c, BCH_DATA_BTREE, bch2_extent_devs(e))) {
+ if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), k);
bch2_fs_bug(c,
@@ -1834,7 +1834,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
}
if (!bkey_extent_is_cached(e.k) &&
- !bch2_sb_has_replicas(c, BCH_DATA_USER, bch2_extent_devs(e))) {
+ !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), e.s_c);
bch2_fs_bug(c,
@@ -2013,17 +2013,18 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
}
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
- struct bkey_s_extent e)
+ struct bkey_s_extent e,
+ unsigned nr_desired_replicas)
{
struct bch_extent_ptr *ptr;
unsigned tier = 0, nr_cached = 0;
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
- if (nr_good <= c->opts.data_replicas)
+ if (nr_good <= nr_desired_replicas)
return;
- nr_cached = nr_good - c->opts.data_replicas;
+ nr_cached = nr_good - nr_desired_replicas;
do {
have_higher_tier = false;
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index e8f54f2..7557927 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -38,7 +38,8 @@ bch2_insert_fixup_extent(struct btree_insert *,
struct btree_insert_entry *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
+void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
+ unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
@@ -430,6 +431,18 @@ static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent
return ret;
}
+static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr)
+ if (ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
switch (k.k->type) {
@@ -441,6 +454,28 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
}
}
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
+ default:
+ return (struct bch_devs_list) { .nr = 0 };
+ }
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
+ default:
+ return (struct bch_devs_list) { .nr = 0 };
+ }
+}
+
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1bffddf..00475b9 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -452,14 +452,18 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
ret = bch2_btree_insert_at(wop->c, &wop->res,
&hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_USE_RESERVE,
BTREE_INSERT_ENTRY(&extent_iter, k),
BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
&hook.inode_p.inode.k_i, 2));
} else {
ret = bch2_btree_insert_at(wop->c, &wop->res,
&hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_USE_RESERVE,
BTREE_INSERT_ENTRY(&extent_iter, k));
}
@@ -502,7 +506,7 @@ static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
bch2_write_op_init(&op->op, c);
op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
- op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+ op->op.compression_type = bch2_compression_opt_to_type[opts.compression];
op->op.devs = c->fastest_devs;
op->op.index_update_fn = bchfs_write_index_update;
op_journal_seq_set(&op->op, &inode->ei_journal_seq);
@@ -2692,6 +2696,10 @@ void bch2_fs_fsio_exit(struct bch_fs *c)
int bch2_fs_fsio_init(struct bch_fs *c)
{
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
+
if (bioset_init(&c->writepage_bioset,
4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
BIOSET_NEED_BVECS) ||
@@ -2701,9 +2709,10 @@ int bch2_fs_fsio_init(struct bch_fs *c)
bioset_init(&c->dio_write_bioset,
4, offsetof(struct dio_write, iop.op.wbio.bio),
BIOSET_NEED_BVECS))
- return -ENOMEM;
+ ret = -ENOMEM;
- return 0;
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 7cddbcc..13495d4 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -209,17 +209,6 @@ static void bch2_write_done(struct closure *cl)
closure_return(cl);
}
-static u64 keylist_sectors(struct keylist *keys)
-{
- struct bkey_i *k;
- u64 ret = 0;
-
- for_each_keylist_key(keys, k)
- ret += k->k.size;
-
- return ret;
-}
-
int bch2_write_index_default(struct bch_write_op *op)
{
struct keylist *keys = &op->insert_keys;
@@ -232,7 +221,8 @@ int bch2_write_index_default(struct bch_write_op *op)
ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
NULL, op_journal_seq(op),
- BTREE_INSERT_NOFAIL);
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE);
bch2_btree_iter_unlock(&iter);
return ret;
@@ -268,8 +258,7 @@ static void bch2_write_index(struct closure *cl)
}
if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_extent_devs(e.c));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c);
if (ret)
goto err;
}
@@ -910,18 +899,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
- bch2_write_op_init(&op->write.op, c);
- op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
- op->write.op.compression_type =
- bch2_compression_opt_to_type(rbio->opts.compression);
-
- op->write.move_dev = -1;
- op->write.op.devs = c->fastest_devs;
- op->write.op.write_point = writepoint_hashed((unsigned long) current);
- op->write.op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- op->write.op.flags |= BCH_WRITE_CACHED;
-
- bch2_migrate_write_init(&op->write, rbio);
+ bch2_migrate_read_done(&op->write, rbio);
closure_init(cl, NULL);
closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
@@ -932,13 +910,16 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
* XXX: multiple promotes can race with each other, wastefully. Keep a list of
* outstanding promotes?
*/
-static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
+static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = rbio->c;
struct promote_op *op;
struct bio *bio;
/* data might have to be decompressed in the write path: */
unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
PAGE_SECTORS);
+ int ret;
BUG_ON(!rbio->bounce);
BUG_ON(pages < rbio->bio.bi_vcnt);
@@ -954,6 +935,14 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+ ret = bch2_migrate_write_init(c, &op->write, c->fastest_devs,
+ writepoint_hashed((unsigned long) current),
+ rbio->opts,
+ DATA_PROMOTE,
+ (struct data_opts) { 0 },
+ k);
+ BUG_ON(ret);
+
return op;
}
@@ -1407,7 +1396,7 @@ noclone:
rbio->pick = *pick;
rbio->pos = pos;
rbio->version = e.k->version;
- rbio->promote = promote ? promote_alloc(rbio) : NULL;
+ rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL;
INIT_WORK(&rbio->work, NULL);
bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 71eee4f..4208fd4 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -70,7 +70,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
op->error = 0;
op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type =
- bch2_compression_opt_to_type(c->opts.compression);
+ bch2_compression_opt_to_type[c->opts.compression];
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index a1e4562..8ce1745 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1046,12 +1046,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
+ fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
i->devs), c,
"superblock not marked as containing replicas (type %u)",
BCH_DATA_JOURNAL))) {
- ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL,
- i->devs);
+ ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
if (ret)
return ret;
}
@@ -2232,7 +2231,7 @@ static void journal_write_done(struct closure *cl)
goto err;
}
- if (bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs))
+ if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
goto err;
out:
__bch2_time_stats_update(j->write_time, j->write_start_time);
@@ -2851,7 +2850,7 @@ int bch2_journal_flush_device(struct journal *j, int dev_idx)
seq++;
spin_unlock(&j->lock);
- ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs);
+ ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
@@ -2946,7 +2945,11 @@ void bch2_fs_journal_exit(struct journal *j)
int bch2_fs_journal_init(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
static struct lock_class_key res_key;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
@@ -2972,12 +2975,15 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
- !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL)))
- return -ENOMEM;
+ !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+ ret = -ENOMEM;
+ goto out;
+ }
j->pin.front = j->pin.back = 1;
-
- return 0;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
/* debug: */
diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h
index b7c8a86..a8c8883 100644
--- a/libbcachefs/keylist.h
+++ b/libbcachefs/keylist.h
@@ -58,6 +58,17 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
#define keylist_single(k) \
((struct keylist) { .keys = k, .top = bkey_next(k) })
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+ struct bkey_i *k;
+ u64 ret = 0;
+
+ for_each_keylist_key(keys, k)
+ ret += k->k.size;
+
+ return ret;
+}
+
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *);
#else
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 9c2920c..9200ed9 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -13,118 +13,6 @@
#include "move.h"
#include "super-io.h"
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
-{
- struct bch_dev *ca = arg;
-
- return bch2_extent_has_device(e, ca->dev_idx);
-}
-
-#define MAX_DATA_OFF_ITER 10
-
-static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
- int flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_move_stats stats;
- unsigned pass = 0;
- int ret = 0;
-
- if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
- return 0;
-
- /*
- * XXX: we should be able to do this in one pass, but bch2_move_data()
- * can spuriously fail to move an extent due to racing with other move
- * operations
- */
- do {
- memset(&stats, 0, sizeof(stats));
-
- ret = bch2_move_data(c, NULL,
- SECTORS_IN_FLIGHT_PER_DEVICE,
- NULL,
- writepoint_hashed((unsigned long) current),
- 0,
- ca->dev_idx,
- POS_MIN, POS_MAX,
- migrate_pred, ca,
- &stats);
- if (ret) {
- bch_err(c, "error migrating data: %i", ret);
- return ret;
- }
- } while (atomic64_read(&stats.keys_moved) && pass++ < MAX_DATA_OFF_ITER);
-
- if (atomic64_read(&stats.keys_moved)) {
- bch_err(c, "unable to migrate all data in %d iterations",
- MAX_DATA_OFF_ITER);
- return -1;
- }
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
- if (ret) {
- bch_err(c, "error migrating data %i from check_mark_super()", ret);
- break;
- }
- }
-
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
- return ret;
-}
-
-static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
- int flags)
-{
- struct btree_iter iter;
- struct btree *b;
- int ret = 0;
- unsigned id;
-
- if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_BTREE)))
- return 0;
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
- if (!bch2_extent_has_device(e, ca->dev_idx))
- continue;
-
- ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- goto err;
- }
- }
- ret = bch2_btree_iter_unlock(&iter);
- if (ret)
- goto err;
- }
-err:
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
- return ret;
-}
-
-int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW &&
- bch2_dev_is_online(ca));
-
- return bch2_dev_usrdata_migrate(c, ca, flags) ?:
- bch2_dev_metadata_migrate(c, ca, flags);
-}
-
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned dev_idx, int flags, bool metadata)
{
@@ -152,7 +40,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
int ret = 0;
mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+ bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
POS_MIN, BTREE_ITER_PREFETCH);
@@ -161,8 +49,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
!(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k) ||
!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_bkey_devs(k));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
if (ret)
break;
bch2_btree_iter_next(&iter);
@@ -183,8 +70,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
*/
bch2_extent_normalize(c, e.s);
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_bkey_devs(bkey_i_to_s_c(&tmp.key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+ bkey_i_to_s_c(&tmp.key));
if (ret)
break;
@@ -240,8 +127,8 @@ retry:
dev_idx)) {
bch2_btree_iter_set_locks_want(&iter, 0);
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ bkey_i_to_s_c(&b->key));
if (ret)
goto err;
} else {
diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h
index 6db7b91..de2faab 100644
--- a/libbcachefs/migrate.h
+++ b/libbcachefs/migrate.h
@@ -1,7 +1,6 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */
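The move.c changes below replace the old bool-returning move predicates with a single move_pred_fn that both classifies an extent and fills in the options for acting on it. The supporting types live in move.h, whose hunk isn't shown in this diff; reconstructed from their uses below, they look roughly like:

/* sketch inferred from usage in move.c, not the actual move.h hunk */
enum data_cmd {
	DATA_SKIP,		/* leave the extent alone */
	DATA_SCRUB,		/* not implemented yet (BUG()s below) */
	DATA_ADD_REPLICAS,	/* write extra copies, drop nothing */
	DATA_REWRITE,		/* rewrite, dropping data_opts.rewrite_dev */
	DATA_PROMOTE,		/* cached copy on the fastest devices */
};

struct data_opts {
	int		btree_insert_flags;
	unsigned	rewrite_dev;
};

typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
				      enum bkey_type, struct bkey_s_c_extent,
				      struct bch_io_opts *,
				      struct data_opts *);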
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index e5a46ba..a176484 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -58,6 +58,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
BKEY_PADDED(k) _new, _insert;
struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
+ unsigned nr_dirty;
bool did_work = false;
if (btree_iter_err(k)) {
@@ -71,6 +72,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
m->ptr, m->offset))
goto nomatch;
+ if (m->data_cmd == DATA_REWRITE &&
+ !bch2_extent_has_device(bkey_s_c_to_extent(k),
+ m->data_opts.rewrite_dev))
+ goto nomatch;
+
bkey_reassemble(&_insert.k, k);
insert = bkey_i_to_extent(&_insert.k);
@@ -81,11 +87,12 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_cut_back(new->k.p, &insert->k);
bch2_cut_back(insert->k.p, &new->k);
- if (m->move_dev >= 0 &&
- (ptr = (struct bch_extent_ptr *)
- bch2_extent_has_device(extent_i_to_s_c(insert),
- m->move_dev)))
+ if (m->data_cmd == DATA_REWRITE) {
+ ptr = (struct bch_extent_ptr *)
+ bch2_extent_has_device(extent_i_to_s_c(insert),
+ m->data_opts.rewrite_dev);
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
+ }
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
@@ -108,10 +115,35 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
bch2_extent_narrow_crcs(insert,
(struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, extent_i_to_s(insert).s);
- bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
+ bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
+ c->opts.data_replicas);
+
+ /*
+ * It's possible we race, and for whatever reason the extent now
+ * has fewer replicas than when we last looked at it - meaning
+ * we need to get a disk reservation here:
+ */
+ nr_dirty = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i));
+ if (m->nr_ptrs_reserved < nr_dirty) {
+ unsigned sectors = (nr_dirty - m->nr_ptrs_reserved) *
+ keylist_sectors(keys);
+
+ /*
+ * can't call bch2_disk_reservation_add() with btree
+ * locks held, at least not without a song and dance
+ */
+ bch2_btree_iter_unlock(&iter);
+
+ ret = bch2_disk_reservation_add(c, &op->res, sectors, 0);
+ if (ret)
+ goto out;
+
+ m->nr_ptrs_reserved = nr_dirty;
+ goto next;
+ }
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_extent_devs(extent_i_to_s_c(insert)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+ extent_i_to_s_c(insert).s_c);
if (ret)
break;
@@ -119,7 +151,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
NULL, op_journal_seq(op),
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
- m->btree_insert_flags,
+ BTREE_INSERT_USE_RESERVE|
+ m->data_opts.btree_insert_flags,
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
if (!ret)
atomic_long_inc(&c->extent_migrate_done);
@@ -150,8 +183,7 @@ out:
return ret;
}
-void bch2_migrate_write_init(struct migrate_write *m,
- struct bch_read_bio *rbio)
+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
{
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
@@ -162,16 +194,39 @@ void bch2_migrate_write_init(struct migrate_write *m,
m->op.pos = rbio->pos;
m->op.version = rbio->version;
m->op.crc = rbio->pick.crc;
+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
m->op.csum_type = m->op.crc.csum_type;
}
- if (m->move_dev >= 0)
- bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
+ if (m->data_cmd == DATA_REWRITE)
+ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
+}
+
+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
+ struct bch_devs_mask *devs,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ enum data_cmd data_cmd,
+ struct data_opts data_opts,
+ struct bkey_s_c k)
+{
+ int ret;
- if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+ m->data_cmd = data_cmd;
+ m->data_opts = data_opts;
+ m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k);
+
+ bch2_write_op_init(&m->op, c);
+ m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);
+ m->op.compression_type =
+ bch2_compression_opt_to_type[io_opts.compression];
+ m->op.devs = devs;
+ m->op.write_point = wp;
+
+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_MOVINGGC;
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
@@ -180,10 +235,35 @@ void bch2_migrate_write_init(struct migrate_write *m,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_NOMARK_REPLICAS;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
m->op.nr_replicas = 1;
m->op.nr_replicas_required = 1;
m->op.index_update_fn = bch2_migrate_index_update;
+
+ switch (data_cmd) {
+ case DATA_ADD_REPLICAS:
+ if (m->nr_ptrs_reserved < c->opts.data_replicas) {
+ m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved;
+
+ ret = bch2_disk_reservation_get(c, &m->op.res,
+ k.k->size,
+ m->op.nr_replicas, 0);
+ if (ret)
+ return ret;
+
+ m->nr_ptrs_reserved = c->opts.data_replicas;
+ }
+ break;
+ case DATA_REWRITE:
+ break;
+ case DATA_PROMOTE:
+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
+ m->op.flags |= BCH_WRITE_CACHED;
+ break;
+ default:
+ BUG();
+ }
+
+ return 0;
}
static void move_free(struct closure *cl)
@@ -210,7 +290,7 @@ static void move_write(struct closure *cl)
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_status)) {
- bch2_migrate_write_init(&io->write, &io->rbio);
+ bch2_migrate_read_done(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
@@ -238,19 +318,19 @@ static void move_read_endio(struct bio *bio)
}
static int bch2_move_extent(struct bch_fs *c,
- struct moving_context *ctxt,
- struct bch_devs_mask *devs,
- struct write_point_specifier wp,
- int btree_insert_flags,
- int move_device,
- struct bch_io_opts opts,
- struct bkey_s_c_extent e)
+ struct moving_context *ctxt,
+ struct bch_devs_mask *devs,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ struct bkey_s_c_extent e,
+ enum data_cmd data_cmd,
+ struct data_opts data_opts)
{
struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
- unsigned sectors = e.k->size, pages, nr_good;
+ unsigned sectors = e.k->size, pages;
int ret = -ENOMEM;
bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
@@ -279,7 +359,7 @@ static int bch2_move_extent(struct bch_fs *c,
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
goto err_free;
- io->rbio.opts = opts;
+ io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -288,27 +368,10 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
io->rbio.bio.bi_end_io = move_read_endio;
- io->write.btree_insert_flags = btree_insert_flags;
- io->write.move_dev = move_device;
-
- bch2_write_op_init(&io->write.op, c);
- io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
- io->write.op.compression_type =
- bch2_compression_opt_to_type(opts.compression);
- io->write.op.devs = devs;
- io->write.op.write_point = wp;
-
- if (move_device < 0 &&
- ((nr_good = bch2_extent_nr_good_ptrs(c, e)) <
- c->opts.data_replicas)) {
- io->write.op.nr_replicas = c->opts.data_replicas - nr_good;
-
- ret = bch2_disk_reservation_get(c, &io->write.op.res,
- e.k->size,
- io->write.op.nr_replicas, 0);
- if (ret)
- goto err_free_pages;
- }
+ ret = bch2_migrate_write_init(c, &io->write, devs, wp,
+ io_opts, data_cmd, data_opts, e.s_c);
+ if (ret)
+ goto err_free_pages;
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
@@ -369,8 +432,6 @@ int bch2_move_data(struct bch_fs *c,
unsigned sectors_in_flight,
struct bch_devs_mask *devs,
struct write_point_specifier wp,
- int btree_insert_flags,
- int move_device,
struct bpos start,
struct bpos end,
move_pred_fn pred, void *arg,
@@ -378,12 +439,14 @@ int bch2_move_data(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt = { .stats = stats };
- struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
struct bkey_s_c_extent e;
+ struct data_opts data_opts;
+ enum data_cmd data_cmd;
u64 cur_inum = U64_MAX;
- int ret = 0;
+ int ret = 0, ret2;
closure_init_stack(&ctxt.cl);
INIT_LIST_HEAD(&ctxt.reads);
@@ -430,28 +493,44 @@ peek:
/* don't hold btree locks while looking up inode: */
bch2_btree_iter_unlock(&stats->iter);
- opts = bch2_opts_to_inode_opts(c->opts);
+ io_opts = bch2_opts_to_inode_opts(c->opts);
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
- bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
cur_inum = k.k->p.inode;
goto peek;
}
- if (!pred(arg, e))
+ switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
+ &io_opts, &data_opts))) {
+ case DATA_SKIP:
goto next;
+ case DATA_SCRUB:
+ BUG();
+ case DATA_ADD_REPLICAS:
+ case DATA_REWRITE:
+ case DATA_PROMOTE:
+ break;
+ default:
+ BUG();
+ }
/* unlock before doing IO: */
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&stats->iter);
- if (bch2_move_extent(c, &ctxt, devs, wp,
- btree_insert_flags,
- move_device, opts,
- bkey_s_c_to_extent(k))) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(&ctxt);
- continue;
+ ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts,
+ bkey_s_c_to_extent(k),
+ data_cmd, data_opts);
+ if (ret2) {
+ if (ret2 == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+
+ /* XXX signal failure */
+ goto next;
}
if (rate)
@@ -486,11 +565,11 @@ static int bch2_gc_data_replicas(struct bch_fs *c)
int ret;
mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+ bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH, k) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
if (ret)
break;
}
@@ -514,8 +593,8 @@ static int bch2_gc_btree_replicas(struct bch_fs *c)
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ bkey_i_to_s_c(&b->key));
bch2_btree_iter_cond_resched(&iter);
}
@@ -534,18 +613,35 @@ static int bch2_move_btree(struct bch_fs *c,
void *arg,
struct bch_move_stats *stats)
{
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree *b;
unsigned id;
+ struct data_opts data_opts;
+ enum data_cmd cmd;
int ret = 0;
stats->data_type = BCH_DATA_BTREE;
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- if (pred(arg, bkey_i_to_s_c_extent(&b->key)))
- ret = bch2_btree_node_rewrite(c, &stats->iter,
- b->data->keys.seq, 0) ?: ret;
+ switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
+ bkey_i_to_s_c_extent(&b->key),
+ &io_opts,
+ &data_opts))) {
+ case DATA_SKIP:
+ goto next;
+ case DATA_SCRUB:
+ BUG();
+ case DATA_ADD_REPLICAS:
+ case DATA_REWRITE:
+ break;
+ default:
+ BUG();
+ }
+ ret = bch2_btree_node_rewrite(c, &stats->iter,
+ b->data->keys.seq, 0) ?: ret;
+next:
bch2_btree_iter_cond_resched(&stats->iter);
}
@@ -556,32 +652,48 @@ static int bch2_move_btree(struct bch_fs *c,
}
#if 0
-static bool scrub_data_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
+ return DATA_SCRUB;
}
#endif
-static bool rereplicate_metadata_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
- struct bch_fs *c = arg;
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+ unsigned replicas = type == BKEY_TYPE_BTREE
+ ? c->opts.metadata_replicas
+ : c->opts.data_replicas;
- return nr_good && nr_good < c->opts.metadata_replicas;
-}
+ if (!nr_good || nr_good >= replicas)
+ return DATA_SKIP;
-static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e)
-{
- struct bch_fs *c = arg;
- unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
-
- return nr_good && nr_good < c->opts.data_replicas;
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
}
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
struct bch_ioctl_data *op = arg;
- return bch2_extent_has_device(e, op->migrate.dev);
+ if (!bch2_extent_has_device(e, op->migrate.dev))
+ return DATA_SKIP;
+
+ data_opts->btree_insert_flags = 0;
+ data_opts->rewrite_dev = op->migrate.dev;
+ return DATA_REWRITE;
}
int bch2_data_job(struct bch_fs *c,
@@ -595,16 +707,15 @@ int bch2_data_job(struct bch_fs *c,
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device(&c->journal, -1);
- ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret;
+ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
NULL,
writepoint_hashed((unsigned long) current),
- 0, -1,
op.start,
op.end,
- rereplicate_data_pred, c, stats) ?: ret;
+ rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_data_replicas(c) ?: ret;
break;
case BCH_DATA_OP_MIGRATE:
@@ -620,7 +731,6 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
NULL,
writepoint_hashed((unsigned long) current),
- 0, -1,
op.start,
op.end,
migrate_pred, &op, stats) ?: ret;
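
A note on the error plumbing in bch2_data_job(): the phases are chained with the GNU `?:` extension, so a phase that fails overrides ret while a phase that succeeds preserves any earlier failure. A standalone illustration (the phase functions here are hypothetical, for demonstration only):

	#include <linux/errno.h>

	/* `x ?: y` evaluates to x when x is non-zero, else to y, so
	 * `ret = phase() ?: ret;` reports the newest failure while a
	 * successful phase (returning 0) leaves an earlier error intact. */
	static int phase_ok(void)   { return 0; }
	static int phase_fail(void) { return -EIO; }

	static int run_job(void)
	{
		int ret = 0;

		ret = phase_ok()   ?: ret;	/* ret == 0 */
		ret = phase_fail() ?: ret;	/* ret == -EIO */
		ret = phase_ok()   ?: ret;	/* still -EIO */
		return ret;
	}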
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 07aa566..819e5d9 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -8,23 +8,47 @@
struct bch_read_bio;
struct moving_context;
+enum data_cmd {
+ DATA_SKIP,
+ DATA_SCRUB,
+ DATA_ADD_REPLICAS,
+ DATA_REWRITE,
+ DATA_PROMOTE,
+};
+
+struct data_opts {
+ unsigned rewrite_dev;
+ int btree_insert_flags;
+};
+
struct migrate_write {
+ enum data_cmd data_cmd;
+ struct data_opts data_opts;
+
+ unsigned nr_ptrs_reserved;
+
struct moving_context *ctxt;
/* what we read: */
struct bch_extent_ptr ptr;
u64 offset;
- int move_dev;
- int btree_insert_flags;
struct bch_write_op op;
};
-void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
+ struct bch_devs_mask *,
+ struct write_point_specifier,
+ struct bch_io_opts,
+ enum data_cmd, struct data_opts,
+ struct bkey_s_c);
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
-typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
+ enum bkey_type, struct bkey_s_c_extent,
+ struct bch_io_opts *, struct data_opts *);
struct bch_move_stats {
enum bch_data_type data_type;
@@ -39,7 +63,7 @@ struct bch_move_stats {
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
unsigned, struct bch_devs_mask *,
struct write_point_specifier,
- int, int, struct bpos, struct bpos,
+ struct bpos, struct bpos,
move_pred_fn, void *,
struct bch_move_stats *);
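
For reference, the contract a move_pred_fn carries under the new interface: return a data_cmd, and for anything other than DATA_SKIP (or the not-yet-implemented DATA_SCRUB) fill in *data_opts, since the btree-insert flags and target device are no longer passed to bch2_move_data() directly. A minimal conforming predicate, modelled on migrate_pred() above (the name and arg convention are invented for illustration):

	/* Rewrite every extent with a pointer on the given device. */
	static enum data_cmd rewrite_dev_pred(struct bch_fs *c, void *arg,
					      enum bkey_type type,
					      struct bkey_s_c_extent e,
					      struct bch_io_opts *io_opts,
					      struct data_opts *data_opts)
	{
		unsigned dev_idx = *(unsigned *) arg;

		if (!bch2_extent_has_device(e, dev_idx))
			return DATA_SKIP;

		data_opts->btree_insert_flags = 0;
		data_opts->rewrite_dev = dev_idx;
		return DATA_REWRITE;
	}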
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 515d500..c306a89 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -61,9 +61,9 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
return (l->offset > r->offset) - (l->offset < r->offset);
}
-static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
+static bool __copygc_pred(struct bch_dev *ca,
+ struct bkey_s_c_extent e)
{
- struct bch_dev *ca = arg;
copygc_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr =
bch2_extent_has_device(e, ca->dev_idx);
@@ -83,6 +83,22 @@ static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
return false;
}
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ struct bch_dev *ca = arg;
+
+ if (!__copygc_pred(ca, e))
+ return DATA_SKIP;
+
+ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
+ data_opts->rewrite_dev = ca->dev_idx;
+ return DATA_REWRITE;
+}
+
static bool have_copygc_reserve(struct bch_dev *ca)
{
bool ret;
@@ -165,8 +181,6 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
SECTORS_IN_FLIGHT_PER_DEVICE,
&ca->self,
writepoint_ptr(&ca->copygc_write_point),
- BTREE_INSERT_USE_RESERVE,
- ca->dev_idx,
POS_MIN, POS_MAX,
copygc_pred, ca,
&move_stats);
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index eae63cf..ec50345 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -22,6 +22,7 @@ const char * const bch2_compression_types[] = {
"none",
"lz4",
"gzip",
+ "zstd",
NULL
};
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 5d42dd5..8a3ac66 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -73,10 +73,10 @@ enum opt_type {
BCH_OPT(errors, u8, OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \
- BCH_OPT(metadata_replicas, u8, OPT_MOUNT, \
+ BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_META_REPLICAS_WANT, 1) \
- BCH_OPT(data_replicas, u8, OPT_MOUNT, \
+ BCH_OPT(data_replicas, u8, OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_DATA_REPLICAS_WANT, 1) \
BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \
@@ -127,6 +127,9 @@ enum opt_type {
BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
+ BCH_OPT(verbose_init, u8, OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false) \
BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \
OPT_BOOL(), \
NO_SB_OPT, false) \
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index 6ab2c86..d28f133 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -74,13 +74,6 @@ static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
_i < QTYP_NR); \
_i++)
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
- return ((c->opts.usrquota << QTYP_USR)|
- (c->opts.grpquota << QTYP_GRP)|
- (c->opts.prjquota << QTYP_PRJ));
-}
-
static bool ignore_hardlimit(struct bch_memquota_type *q)
{
if (capable(CAP_SYS_RESOURCE))
@@ -478,7 +471,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
return -EINVAL;
- if (uflags & FS_QUOTA_PDQ_ENFD)
+ if ((uflags & FS_QUOTA_PDQ_ENFD) && !c->opts.prjquota)
return -EINVAL;
mutex_lock(&c->sb_lock);
@@ -487,10 +480,9 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
-#if 0
+
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
-#endif
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h
index b5536be..509b7f0 100644
--- a/libbcachefs/quota.h
+++ b/libbcachefs/quota.h
@@ -20,6 +20,13 @@ static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
};
}
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+ return ((c->opts.usrquota << QTYP_USR)|
+ (c->opts.grpquota << QTYP_GRP)|
+ (c->opts.prjquota << QTYP_PRJ));
+}
+
#ifdef CONFIG_BCACHEFS_QUOTA
int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
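
With enabled_qtypes() exported from quota.h, the "any quotas enabled?" test is reusable outside quota.c (it is what __bch2_fs_start() now calls). Since the return value is a bitmask indexed by quota type, individual types can be tested as well; a small sketch (qtype_enabled() is a hypothetical helper, not part of the patch):

	/* Bit `type` (QTYP_USR, QTYP_GRP or QTYP_PRJ) is set iff the
	 * corresponding mount option was enabled. */
	static bool qtype_enabled(struct bch_fs *c, unsigned type)
	{
		return enabled_qtypes(c) & (1U << type);
	}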
diff --git a/libbcachefs/siphash.c b/libbcachefs/siphash.c
index d689a7b..3a6c9c8 100644
--- a/libbcachefs/siphash.c
+++ b/libbcachefs/siphash.c
@@ -43,7 +43,6 @@
* https://131002.net/siphash/
*/
-#include <linux/compiler.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/bitops.h>
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index f333b8f..c747391 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -546,6 +546,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
__le64 *i;
int ret;
+ pr_verbose_init(*opts, "");
+
memset(sb, 0, sizeof(*sb));
sb->mode = FMODE_READ;
@@ -566,8 +568,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev))
- return PTR_ERR(sb->bdev);
+ if (IS_ERR(sb->bdev)) {
+ ret = PTR_ERR(sb->bdev);
+ goto out;
+ }
err = "cannot allocate memory";
ret = __bch2_super_realloc(sb, 0);
@@ -638,12 +642,14 @@ got_super:
if (sb->mode & FMODE_WRITE)
bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
|= BDI_CAP_STABLE_WRITES;
-
- return 0;
+ ret = 0;
+out:
+ pr_verbose_init(*opts, "ret %i", ret);
+ return ret;
err:
bch2_free_super(sb);
pr_err("error reading superblock: %s", err);
- return ret;
+ goto out;
}
/* write superblock: */
@@ -744,17 +750,15 @@ void bch2_write_super(struct bch_fs *c)
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
- bch2_have_enough_devs(c,
- __bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+ BCH_FORCE_IF_DEGRADED);
for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
- bch2_have_enough_devs(c,
- __bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+ BCH_FORCE_IF_DEGRADED);
/*
* If we would be able to mount _without_ the devices we successfully
@@ -1052,7 +1056,7 @@ static bool replicas_has_entry(struct bch_replicas_cpu *r,
}
noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
@@ -1109,9 +1113,9 @@ err:
return ret;
}
-int bch2_check_mark_super(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
+int bch2_mark_replicas(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
struct bch_replicas_cpu *r, *gc_r;
@@ -1121,6 +1125,8 @@ int bch2_check_mark_super(struct bch_fs *c,
if (!devs.nr)
return 0;
+ BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
@@ -1131,7 +1137,23 @@ int bch2_check_mark_super(struct bch_fs *c,
rcu_read_unlock();
return likely(marked) ? 0
- : bch2_check_mark_super_slowpath(c, search, max_dev);
+ : bch2_mark_replicas_slowpath(c, search, max_dev);
+}
+
+int bch2_mark_bkey_replicas(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bkey_s_c k)
+{
+ struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < cached.nr; i++)
+ if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+ bch2_dev_list_single(cached.devs[i]))))
+ return ret;
+
+ return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
@@ -1417,7 +1439,7 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t
/* Query replicas: */
-bool bch2_sb_has_replicas(struct bch_fs *c,
+bool bch2_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
@@ -1438,6 +1460,21 @@ bool bch2_sb_has_replicas(struct bch_fs *c,
return ret;
}
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bkey_s_c k)
+{
+ struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+ unsigned i;
+
+ for (i = 0; i < cached.nr; i++)
+ if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+ bch2_dev_list_single(cached.devs[i])))
+ return false;
+
+ return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{
@@ -1495,29 +1532,26 @@ struct replicas_status bch2_replicas_status(struct bch_fs *c)
return __bch2_replicas_status(c, bch2_online_devs(c));
}
-bool bch2_have_enough_devs(struct bch_fs *c,
- struct replicas_status s,
- unsigned flags)
+static bool have_enough_devs(struct replicas_status s,
+ enum bch_data_type type,
+ bool force_if_degraded,
+ bool force_if_lost)
{
- if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
- s.replicas[BCH_DATA_BTREE].nr_offline) &&
- !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
- return false;
-
- if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
- !s.replicas[BCH_DATA_BTREE].nr_online) &&
- !(flags & BCH_FORCE_IF_METADATA_LOST))
- return false;
-
- if (s.replicas[BCH_DATA_USER].nr_offline &&
- !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return false;
-
- if (!s.replicas[BCH_DATA_USER].nr_online &&
- !(flags & BCH_FORCE_IF_DATA_LOST))
- return false;
+ return (!s.replicas[type].nr_offline || force_if_degraded) &&
+ (s.replicas[type].nr_online || force_if_lost);
+}
- return true;
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+ return (have_enough_devs(s, BCH_DATA_JOURNAL,
+ flags & BCH_FORCE_IF_METADATA_DEGRADED,
+ flags & BCH_FORCE_IF_METADATA_LOST) &&
+ have_enough_devs(s, BCH_DATA_BTREE,
+ flags & BCH_FORCE_IF_METADATA_DEGRADED,
+ flags & BCH_FORCE_IF_METADATA_LOST) &&
+ have_enough_devs(s, BCH_DATA_USER,
+ flags & BCH_FORCE_IF_DATA_DEGRADED,
+ flags & BCH_FORCE_IF_DATA_LOST));
}
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
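
The have_enough_devs() refactor reduces the old four-branch check to two conditions applied per data type, which is easier to audit. Restated as standalone logic (equivalent to the helper above, not additional code in the patch):

	/* For one data type:
	 *  - offline replicas are tolerated only when a degraded mount
	 *    is forced,
	 *  - zero online replicas are tolerated only when mounting with
	 *    lost data is forced. */
	static bool one_type_ok(unsigned nr_online, unsigned nr_offline,
				bool force_if_degraded, bool force_if_lost)
	{
		return (!nr_offline || force_if_degraded) &&
		       (nr_online || force_if_lost);
	}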
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index eb85410..d7fecf0 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -139,10 +139,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_replicas: */
-bool bch2_sb_has_replicas(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list);
-int bch2_check_mark_super(struct bch_fs *, enum bch_data_type,
+bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
+bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
+ struct bkey_s_c);
+int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
+ struct bch_devs_list);
+int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
+ struct bkey_s_c);
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
@@ -157,7 +161,7 @@ struct replicas_status {
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct bch_fs *, struct replicas_status, unsigned);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
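
The renamed replicas interface pairs a query with a mutator at both granularities: bch2_replicas_marked()/bch2_mark_replicas() for a raw device list, and the bkey variants, which additionally record each cached pointer as a single-device BCH_DATA_CACHED entry. A check-then-mark walk over extents might look roughly like this (a sketch in the style of bch2_gc_data_replicas() above, not a call site in this patch):

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
			   BTREE_ITER_PREFETCH, k) {
		/* already marked: bch2_mark_bkey_replicas() would be a no-op */
		if (bch2_bkey_replicas_marked(c, BCH_DATA_USER, k))
			continue;

		ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
		if (ret)
			break;
	}
	bch2_btree_iter_unlock(&iter);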
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index f836c19..58bcd7d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -507,9 +507,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
struct bch_fs *c;
unsigned i, iter_size;
+ pr_verbose_init(opts, "");
+
c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c)
- return NULL;
+ goto out;
__module_get(THIS_MODULE);
@@ -539,7 +541,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->btree_interior_update_lock);
mutex_init(&c->bio_bounce_pages_lock);
- mutex_init(&c->zlib_workspace_lock);
bio_list_init(&c->btree_write_error_list);
spin_lock_init(&c->btree_write_error_lock);
@@ -646,10 +647,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+out:
+ pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
return c;
err:
bch2_fs_free(c);
- return NULL;
+ c = NULL;
+ goto out;
}
static const char *__bch2_fs_online(struct bch_fs *c)
@@ -809,7 +813,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
goto err;
bch_verbose(c, "fsck done");
- if (c->opts.usrquota || c->opts.grpquota) {
+ if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
ret = bch2_fs_quota_read(c);
if (ret)
@@ -864,7 +868,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
NULL, NULL, NULL, 0))
goto err;
- if (c->opts.usrquota || c->opts.grpquota) {
+ if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
@@ -1084,14 +1088,17 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
struct bch_member *member;
- struct bch_dev *ca;
+ struct bch_dev *ca = NULL;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
if (bch2_fs_init_fault("dev_alloc"))
- return -ENOMEM;
+ goto err;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- return -ENOMEM;
+ goto err;
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
@@ -1133,11 +1140,14 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
if (bch2_dev_sysfs_online(c, ca))
pr_warn("error creating sysfs objects");
-
- return 0;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
err:
- bch2_dev_free(ca);
- return -ENOMEM;
+ if (ca)
+ bch2_dev_free(ca);
+ ret = -ENOMEM;
+ goto out;
}
static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
@@ -1240,7 +1250,8 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
/* do we have enough devices to write to? */
for_each_member_device(ca2, c, i)
- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+ if (ca2 != ca)
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
@@ -1249,7 +1260,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
? c->opts.data_replicas
: c->opts.data_replicas_required);
- return nr_rw - 1 <= required;
+ return nr_rw >= required;
case BCH_MEMBER_STATE_FAILED:
case BCH_MEMBER_STATE_SPARE:
if (ca->mi.state != BCH_MEMBER_STATE_RW &&
@@ -1262,7 +1273,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
s = __bch2_replicas_status(c, new_online_devs);
- return bch2_have_enough_devs(c, s, flags);
+ return bch2_have_enough_devs(s, flags);
default:
BUG();
}
@@ -1299,7 +1310,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
s = bch2_replicas_status(c);
- return bch2_have_enough_devs(c, s, flags);
+ return bch2_have_enough_devs(s, flags);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1346,12 +1357,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
return -EINVAL;
- if (new_state == BCH_MEMBER_STATE_RW) {
- if (__bch2_dev_read_write(c, ca))
- return -ENOMEM;
- } else {
+ if (new_state != BCH_MEMBER_STATE_RW)
__bch2_dev_read_only(c, ca);
- }
bch_notice(ca, "%s", bch2_dev_state[new_state]);
@@ -1361,6 +1368,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ if (new_state == BCH_MEMBER_STATE_RW)
+ return __bch2_dev_read_write(c, ca) ? -ENOMEM : 0;
+
return 0;
}
@@ -1701,11 +1711,17 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
const char *err;
int ret = -ENOMEM;
- if (!nr_devices)
- return ERR_PTR(-EINVAL);
+ pr_verbose_init(opts, "");
- if (!try_module_get(THIS_MODULE))
- return ERR_PTR(-ENODEV);
+ if (!nr_devices) {
+ c = ERR_PTR(-EINVAL);
+ goto out2;
+ }
+
+ if (!try_module_get(THIS_MODULE)) {
+ c = ERR_PTR(-ENODEV);
+ goto out2;
+ }
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
if (!sb)
@@ -1760,8 +1776,11 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
if (err)
goto err_print;
+out:
kfree(sb);
module_put(THIS_MODULE);
+out2:
+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
err_print:
pr_err("bch_fs_open err opening %s: %s",
@@ -1770,12 +1789,10 @@ err_print:
err:
if (c)
bch2_fs_stop(c);
-
for (i = 0; i < nr_devices; i++)
bch2_free_super(&sb[i]);
- kfree(sb);
- module_put(THIS_MODULE);
- return ERR_PTR(ret);
+ c = ERR_PTR(ret);
+ goto out;
}
static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
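
A pattern worth noting across bch2_fs_alloc(), bch2_dev_alloc(), bch2_read_super() and bch2_fs_open(): with the new verbose_init option, each init-path function logs on entry and logs its result on every exit, which is what pushes them into the single-exit goto-out shape seen above. The skeleton, condensed (init_step() is hypothetical; the structure mirrors bch2_dev_alloc()):

	static int init_step(struct bch_fs *c)
	{
		int ret = 0;

		pr_verbose_init(c->opts, "");

		if (bch2_fs_init_fault("init_step"))
			goto err;

		/* ... the actual initialization work ... */
	out:
		pr_verbose_init(c->opts, "ret %i", ret);	/* logged on every exit */
		return ret;
	err:
		ret = -ENOMEM;
		goto out;
	}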
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index d0a38cf..1718f5c 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -67,6 +67,11 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
devs->devs[devs->nr++] = dev;
}
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
const struct bch_devs_mask *mask)
{
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 966da4a..d76d917 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -15,7 +15,7 @@ struct bch_devs_mask {
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_REPLICAS_MAX];
+ u8 devs[BCH_REPLICAS_MAX + 1];
};
struct bch_member_cpu {
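
The extra slot in bch_devs_list plausibly makes room for one cached pointer on top of BCH_REPLICAS_MAX dirty pointers, matching the dirty/cached split that bch2_mark_bkey_replicas() now performs (while the new BUG_ON in bch2_mark_replicas() still bounds any single replicas entry). Building such a list with the super.h helpers (illustrative only):

	/* A key's full device list may now exceed BCH_REPLICAS_MAX by one,
	 * e.g. dirty replicas plus a cached copy: */
	struct bch_devs_list devs = bch2_dev_list_single(0);

	bch2_dev_list_add_dev(&devs, 1);
	bch2_dev_list_add_dev(&devs, 2);
	/* devs.nr == 3 */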
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 597e1f0..2e958a8 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -164,6 +164,8 @@ read_attribute(extent_migrate_raced);
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
+rw_attribute(writeback_pages_max);
+
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
@@ -310,6 +312,8 @@ SHOW(bch2_fs)
sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ sysfs_print(writeback_pages_max, c->writeback_pages_max);
+
sysfs_print(block_size, block_bytes(c));
sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
@@ -370,6 +374,9 @@ STORE(__bch2_fs)
sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ if (attr == &sysfs_writeback_pages_max)
+ c->writeback_pages_max = strtoul_restrict_or_return(buf, 1, UINT_MAX);
+
if (attr == &sysfs_btree_gc_periodic) {
ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
?: (ssize_t) size;
@@ -459,6 +466,8 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
+ &sysfs_writeback_pages_max,
+
&sysfs_tiering_percent,
&sysfs_compression_stats,
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index c4625c8..775c2e2 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -14,10 +14,9 @@
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
-static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
+static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier,
+ struct bkey_s_c_extent e)
{
- struct bch_tier *tier = arg;
- struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
const struct bch_extent_ptr *ptr;
unsigned replicas = 0;
@@ -33,6 +32,21 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
return replicas < c->opts.data_replicas;
}
+static enum data_cmd tiering_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ struct bch_tier *tier = arg;
+
+ if (!__tiering_pred(c, tier, e))
+ return DATA_SKIP;
+
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
+}
+
static int bch2_tiering_thread(void *arg)
{
struct bch_tier *tier = arg;
@@ -90,8 +104,6 @@ static int bch2_tiering_thread(void *arg)
SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
&tier->devs,
writepoint_ptr(&tier->wp),
- 0,
- -1,
POS_MIN, POS_MAX,
tiering_pred, tier,
&move_stats);
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 6e97e83..d475f98 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -817,4 +817,19 @@ do { \
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
+#define bubble_sort(_base, _nr, _cmp) \
+do { \
+ ssize_t _i, _end; \
+ bool _swapped = true; \
+ \
+ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+ _swapped = false; \
+ for (_i = 0; _i < _end; _i++) \
+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
+ swap((_base)[_i], (_base)[_i + 1]); \
+ _swapped = true; \
+ } \
+ } \
+} while (0)
+
#endif /* _BCACHEFS_UTIL_H */
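
The new bubble_sort() takes its comparator as a macro argument rather than a function pointer, so the comparison is inlined at the call site; that only pays off for small arrays, which is what it is meant for. Example use (a standalone sketch; cmp_u64 is defined here for illustration):

	#define cmp_u64(l, r)	((l) > (r) ? 1 : (l) < (r) ? -1 : 0)

	static void sort_offsets(void)
	{
		u64 offsets[] = { 30, 10, 40, 20 };

		bubble_sort(offsets, ARRAY_SIZE(offsets), cmp_u64);
		/* offsets is now { 10, 20, 30, 40 } */
	}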