author     Kent Overstreet <kent.overstreet@linux.dev>  2023-04-15 15:03:36 -0400
committer  Kent Overstreet <kent.overstreet@linux.dev>  2023-08-07 17:52:47 -0400
commit     853bfa639dab6096c20942a020e5b809646a0388 (patch)
tree       633df828e2c371ec2cf1b3a68ceb3e170be0905a
parent     962eee6ab7bd29e3a1ad726a837812f65bcc2474 (diff)
Merge with a7c304e5a4 bcachefs: Add a cond_resched() call to journal_keys_sort()
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
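
The merge subject refers to a patch that adds a scheduler yield point to journal_keys_sort(). As a rough illustration of the pattern (a minimal sketch only, not the actual bcachefs code; the example_* names are placeholders), a long sort-and-dedup pass over journal keys calls cond_resched() on each iteration so it cannot monopolize a CPU:

#include <linux/sched.h>   /* cond_resched() */
#include <linux/sort.h>    /* sort() */
#include <linux/types.h>

/* Minimal sketch only -- not the actual journal_keys_sort() implementation. */
static int example_cmp_u64(const void *l, const void *r)
{
        const u64 a = *(const u64 *)l, b = *(const u64 *)r;

        return a < b ? -1 : a > b ? 1 : 0;
}

static size_t example_sort_and_dedup(u64 *keys, size_t nr)
{
        size_t src, dst = 0;

        sort(keys, nr, sizeof(keys[0]), example_cmp_u64, NULL);

        for (src = 0; src < nr; src++) {
                /*
                 * This loop can walk millions of keys after a large journal
                 * replay; yielding here avoids soft-lockup and RCU-stall
                 * warnings on non-preemptible kernels.
                 */
                cond_resched();

                if (!dst || keys[dst - 1] != keys[src])
                        keys[dst++] = keys[src];
        }

        return dst;
}
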
-rw-r--r--  fs/bcachefs/Kconfig | 23
-rw-r--r--  fs/bcachefs/Makefile | 11
-rw-r--r--  fs/bcachefs/acl.c | 19
-rw-r--r--  fs/bcachefs/acl.h | 4
-rw-r--r--  fs/bcachefs/alloc_background.c | 2247
-rw-r--r--  fs/bcachefs/alloc_background.h | 272
-rw-r--r--  fs/bcachefs/alloc_foreground.c | 1132
-rw-r--r--  fs/bcachefs/alloc_foreground.h | 105
-rw-r--r--  fs/bcachefs/alloc_types.h | 95
-rw-r--r--  fs/bcachefs/backpointers.c | 886
-rw-r--r--  fs/bcachefs/backpointers.h | 130
-rw-r--r--  fs/bcachefs/bbpos.h | 48
-rw-r--r--  fs/bcachefs/bcachefs.h | 339
-rw-r--r--  fs/bcachefs/bcachefs_format.h | 539
-rw-r--r--  fs/bcachefs/bcachefs_ioctl.h | 19
-rw-r--r--  fs/bcachefs/bkey.c | 240
-rw-r--r--  fs/bcachefs/bkey.h | 242
-rw-r--r--  fs/bcachefs/bkey_buf.h | 1
-rw-r--r--  fs/bcachefs/bkey_cmp.h | 129
-rw-r--r--  fs/bcachefs/bkey_methods.c | 268
-rw-r--r--  fs/bcachefs/bkey_methods.h | 127
-rw-r--r--  fs/bcachefs/bkey_sort.c | 15
-rw-r--r--  fs/bcachefs/bset.c | 112
-rw-r--r--  fs/bcachefs/bset.h | 120
-rw-r--r--  fs/bcachefs/btree_cache.c | 695
-rw-r--r--  fs/bcachefs/btree_cache.h | 15
-rw-r--r--  fs/bcachefs/btree_gc.c | 1251
-rw-r--r--  fs/bcachefs/btree_gc.h | 7
-rw-r--r--  fs/bcachefs/btree_io.c | 687
-rw-r--r--  fs/bcachefs/btree_io.h | 76
-rw-r--r--  fs/bcachefs/btree_iter.c | 2379
-rw-r--r--  fs/bcachefs/btree_iter.h | 587
-rw-r--r--  fs/bcachefs/btree_key_cache.c | 721
-rw-r--r--  fs/bcachefs/btree_key_cache.h | 15
-rw-r--r--  fs/bcachefs/btree_locking.c | 795
-rw-r--r--  fs/bcachefs/btree_locking.h | 379
-rw-r--r--  fs/bcachefs/btree_types.h | 331
-rw-r--r--  fs/bcachefs/btree_update.h | 94
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 1238
-rw-r--r--  fs/bcachefs/btree_update_interior.h | 16
-rw-r--r--  fs/bcachefs/btree_update_leaf.c | 1700
-rw-r--r--  fs/bcachefs/btree_write_buffer.c | 340
-rw-r--r--  fs/bcachefs/btree_write_buffer.h | 14
-rw-r--r--  fs/bcachefs/btree_write_buffer_types.h | 44
-rw-r--r--  fs/bcachefs/buckets.c | 1318
-rw-r--r--  fs/bcachefs/buckets.h | 247
-rw-r--r--  fs/bcachefs/buckets_types.h | 46
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal.c | 163
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal.h | 6
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal_types.h | 11
-rw-r--r--  fs/bcachefs/chardev.c | 20
-rw-r--r--  fs/bcachefs/checksum.c | 167
-rw-r--r--  fs/bcachefs/checksum.h | 37
-rw-r--r--  fs/bcachefs/clock.c | 10
-rw-r--r--  fs/bcachefs/compress.c | 89
-rw-r--r--  fs/bcachefs/counters.c | 107
-rw-r--r--  fs/bcachefs/counters.h | 17
-rw-r--r--  fs/bcachefs/darray.h | 82
-rw-r--r--  fs/bcachefs/data_update.c | 638
-rw-r--r--  fs/bcachefs/data_update.h | 43
-rw-r--r--  fs/bcachefs/debug.c | 681
-rw-r--r--  fs/bcachefs/debug.h | 2
-rw-r--r--  fs/bcachefs/dirent.c | 87
-rw-r--r--  fs/bcachefs/dirent.h | 6
-rw-r--r--  fs/bcachefs/disk_groups.c | 144
-rw-r--r--  fs/bcachefs/disk_groups.h | 6
-rw-r--r--  fs/bcachefs/ec.c | 1124
-rw-r--r--  fs/bcachefs/ec.h | 70
-rw-r--r--  fs/bcachefs/ec_types.h | 7
-rw-r--r--  fs/bcachefs/errcode.c | 63
-rw-r--r--  fs/bcachefs/errcode.h | 228
-rw-r--r--  fs/bcachefs/error.c | 181
-rw-r--r--  fs/bcachefs/error.h | 88
-rw-r--r--  fs/bcachefs/extent_update.c | 24
-rw-r--r--  fs/bcachefs/extents.c | 489
-rw-r--r--  fs/bcachefs/extents.h | 134
-rw-r--r--  fs/bcachefs/fifo.h | 2
-rw-r--r--  fs/bcachefs/fs-common.c | 17
-rw-r--r--  fs/bcachefs/fs-io.c | 1474
-rw-r--r--  fs/bcachefs/fs-io.h | 18
-rw-r--r--  fs/bcachefs/fs-ioctl.c | 104
-rw-r--r--  fs/bcachefs/fs.c | 389
-rw-r--r--  fs/bcachefs/fs.h | 55
-rw-r--r--  fs/bcachefs/fsck.c | 1248
-rw-r--r--  fs/bcachefs/inode.c | 428
-rw-r--r--  fs/bcachefs/inode.h | 102
-rw-r--r--  fs/bcachefs/io.c | 1311
-rw-r--r--  fs/bcachefs/io.h | 73
-rw-r--r--  fs/bcachefs/io_types.h | 22
-rw-r--r--  fs/bcachefs/journal.c | 962
-rw-r--r--  fs/bcachefs/journal.h | 110
-rw-r--r--  fs/bcachefs/journal_io.c | 978
-rw-r--r--  fs/bcachefs/journal_io.h | 30
-rw-r--r--  fs/bcachefs/journal_reclaim.c | 223
-rw-r--r--  fs/bcachefs/journal_sb.c | 219
-rw-r--r--  fs/bcachefs/journal_sb.h | 24
-rw-r--r--  fs/bcachefs/journal_seq_blacklist.c | 19
-rw-r--r--  fs/bcachefs/journal_types.h | 103
-rw-r--r--  fs/bcachefs/keylist.c | 19
-rw-r--r--  fs/bcachefs/keylist.h | 2
-rw-r--r--  fs/bcachefs/lru.c | 188
-rw-r--r--  fs/bcachefs/lru.h | 62
-rw-r--r--  fs/bcachefs/migrate.c | 127
-rw-r--r--  fs/bcachefs/move.c | 1360
-rw-r--r--  fs/bcachefs/move.h | 112
-rw-r--r--  fs/bcachefs/move_types.h | 17
-rw-r--r--  fs/bcachefs/movinggc.c | 510
-rw-r--r--  fs/bcachefs/movinggc.h | 3
-rw-r--r--  fs/bcachefs/nocow_locking.c | 123
-rw-r--r--  fs/bcachefs/nocow_locking.h | 49
-rw-r--r--  fs/bcachefs/nocow_locking_types.h | 20
-rw-r--r--  fs/bcachefs/opts.c | 165
-rw-r--r--  fs/bcachefs/opts.h | 150
-rw-r--r--  fs/bcachefs/printbuf.c | 415
-rw-r--r--  fs/bcachefs/printbuf.h | 284
-rw-r--r--  fs/bcachefs/quota.c | 386
-rw-r--r--  fs/bcachefs/quota.h | 6
-rw-r--r--  fs/bcachefs/rebalance.c | 143
-rw-r--r--  fs/bcachefs/recovery.c | 791
-rw-r--r--  fs/bcachefs/recovery.h | 20
-rw-r--r--  fs/bcachefs/reflink.c | 126
-rw-r--r--  fs/bcachefs/reflink.h | 35
-rw-r--r--  fs/bcachefs/replicas.c | 190
-rw-r--r--  fs/bcachefs/replicas.h | 18
-rw-r--r--  fs/bcachefs/replicas_types.h | 17
-rw-r--r--  fs/bcachefs/siphash.c | 2
-rw-r--r--  fs/bcachefs/str_hash.h | 70
-rw-r--r--  fs/bcachefs/subvolume.c | 763
-rw-r--r--  fs/bcachefs/subvolume.h | 96
-rw-r--r--  fs/bcachefs/subvolume_types.h | 18
-rw-r--r--  fs/bcachefs/super-io.c | 779
-rw-r--r--  fs/bcachefs/super-io.h | 14
-rw-r--r--  fs/bcachefs/super.c | 662
-rw-r--r--  fs/bcachefs/super.h | 17
-rw-r--r--  fs/bcachefs/super_types.h | 1
-rw-r--r--  fs/bcachefs/sysfs.c | 660
-rw-r--r--  fs/bcachefs/sysfs.h | 14
-rw-r--r--  fs/bcachefs/tests.c | 303
-rw-r--r--  fs/bcachefs/trace.c | 8
-rw-r--r--  fs/bcachefs/two_state_shared_lock.c | 8
-rw-r--r--  fs/bcachefs/two_state_shared_lock.h | 59
-rw-r--r--  fs/bcachefs/util.c | 590
-rw-r--r--  fs/bcachefs/util.h | 294
-rw-r--r--  fs/bcachefs/varint.c | 1
-rw-r--r--  fs/bcachefs/vstructs.h | 2
-rw-r--r--  fs/bcachefs/xattr.c | 117
-rw-r--r--  fs/bcachefs/xattr.h | 6
-rw-r--r--  include/linux/generic-radix-tree.h | 75
-rw-r--r--  include/linux/six.h | 76
-rw-r--r--  include/trace/events/bcachefs.h | 1067
-rw-r--r--  kernel/locking/six.c | 513
-rw-r--r--  lib/generic-radix-tree.c | 112
152 files changed, 29239 insertions, 15179 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 27742ce276cd..28b585223e89 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -22,6 +22,8 @@ config BCACHEFS_FS
select XOR_BLOCKS
select XXHASH
select SRCU
+ select SYMBOLIC_ERRNAME
+ select MEAN_AND_VARIANCE
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -36,6 +38,15 @@ config BCACHEFS_POSIX_ACL
depends on BCACHEFS_FS
select FS_POSIX_ACL
+config BCACHEFS_DEBUG_TRANSACTIONS
+ bool "bcachefs runtime info"
+ depends on BCACHEFS_FS
+ default y
+ help
+ This makes the list of running btree transactions available in debugfs.
+
+ This is a highly useful debugging feature but does add a small amount of overhead.
+
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
@@ -50,3 +61,15 @@ config BCACHEFS_TESTS
depends on BCACHEFS_FS
help
Include some unit and performance tests for the core btree code
+
+config BCACHEFS_LOCK_TIME_STATS
+ bool "bcachefs lock time statistics"
+ depends on BCACHEFS_FS
+ help
+ Expose statistics for how long we held a lock in debugfs
+
+config BCACHEFS_NO_LATENCY_ACCT
+ bool "disable latency accounting and time stats"
+ depends on BCACHEFS_FS
+ help
+ This disables device latency tracking and time stats, only for performance testing
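
The new select SYMBOLIC_ERRNAME above pairs with the error-reporting changes later in this merge, where numeric codes are replaced by name strings (see the bch2_err_str() calls in alloc_background.c). A minimal, hypothetical sketch of what symbolic errno printing looks like with the kernel's errname()/%pe facilities (not the bcachefs helper itself):

#include <linux/err.h>          /* ERR_PTR() */
#include <linux/errname.h>      /* errname(); returns NULL without CONFIG_SYMBOLIC_ERRNAME */
#include <linux/printk.h>

/*
 * Illustrative sketch only; bch2_err_str() layers bcachefs' private error
 * codes on top of the same idea.
 */
static void example_report_error(int err /* negative errno */)
{
        const char *name = errname(err);        /* NULL if the name is unknown */

        if (name)
                pr_err("operation failed: %s\n", name);
        else
                pr_err("operation failed: %d\n", err);

        /* %pe prints ERR_PTR-style error pointers symbolically, too: */
        pr_err("operation failed: %pe\n", ERR_PTR(err));
}
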
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index cf29fdaadc5b..a71956048a02 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
bcachefs-y := \
alloc_background.o \
alloc_foreground.o \
+ backpointers.o \
bkey.o \
bkey_methods.o \
bkey_sort.o \
@@ -13,18 +14,23 @@ bcachefs-y := \
btree_io.o \
btree_iter.o \
btree_key_cache.o \
+ btree_locking.o \
btree_update_interior.o \
btree_update_leaf.o \
+ btree_write_buffer.o \
buckets.o \
buckets_waiting_for_journal.o \
chardev.o \
checksum.o \
clock.o \
compress.o \
+ counters.o \
debug.o \
dirent.o \
disk_groups.o \
+ data_update.o \
ec.o \
+ errcode.o \
error.o \
extents.o \
extent_update.o \
@@ -38,12 +44,16 @@ bcachefs-y := \
journal.o \
journal_io.o \
journal_reclaim.o \
+ journal_sb.o \
journal_seq_blacklist.o \
keylist.o \
+ lru.o \
migrate.o \
move.o \
movinggc.o \
+ nocow_locking.o \
opts.o \
+ printbuf.o \
quota.o \
rebalance.o \
recovery.o \
@@ -56,6 +66,7 @@ bcachefs-y := \
sysfs.o \
tests.o \
trace.o \
+ two_state_shared_lock.o \
util.o \
varint.o \
xattr.o
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 2588812c5066..5cb06ac58960 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -173,7 +173,7 @@ bch2_acl_to_xattr(struct btree_trans *trans,
bkey_xattr_init(&xattr->k_i);
xattr->k.u64s = u64s;
xattr->v.x_type = acl_to_xattr_type(type);
- xattr->v.x_name_len = 0,
+ xattr->v.x_name_len = 0;
xattr->v.x_val_len = cpu_to_le16(acl_len);
acl_header = xattr_val(&xattr->v);
@@ -212,9 +212,10 @@ bch2_acl_to_xattr(struct btree_trans *trans,
return xattr;
}
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
+struct posix_acl *bch2_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct btree_trans trans;
@@ -233,7 +234,7 @@ retry:
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
if (ret) {
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (ret != -ENOENT)
acl = ERR_PTR(ret);
@@ -289,9 +290,11 @@ int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
return ret == -ENOENT ? 0 : ret;
}
-int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type)
+int bch2_set_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct posix_acl *_acl, int type)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
struct btree_iter inode_iter = { NULL };
@@ -314,7 +317,7 @@ retry:
mode = inode_u.bi_mode;
if (type == ACL_TYPE_ACCESS) {
- ret = posix_acl_update_mode(&inode->v, &mode, &acl);
+ ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl);
if (ret)
goto btree_err;
}
@@ -331,7 +334,7 @@ retry:
btree_err:
bch2_trans_iter_exit(&trans, &inode_iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err;
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index 14cabbc91808..ac206f6584e9 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -26,12 +26,12 @@ typedef struct {
__le32 a_version;
} bch_acl_header;
-struct posix_acl *bch2_get_acl(struct inode *, int);
+struct posix_acl *bch2_get_acl(struct user_namespace *, struct dentry *, int);
int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct posix_acl *, int);
-int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_set_acl(struct user_namespace *, struct dentry *, struct posix_acl *, int);
int bch2_acl_chmod(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
umode_t, struct posix_acl **);
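
The acl.c hunks above are representative of a conversion that recurs throughout this merge: transaction restarts are no longer signalled with -EINTR but with a dedicated error class, so retry loops match on BCH_ERR_transaction_restart. A simplified sketch of the resulting shape (the real callers also manage iterators, commits, and cleanup; example_trans_body() is a placeholder, and the bcachefs headers declaring btree_trans, bch2_trans_begin(), and bch2_err_matches() are assumed to be in scope):

/* Sketch of the retry pattern used by the converted callers. */
static int example_do_in_trans(struct btree_trans *trans)
{
        int ret;
retry:
        bch2_trans_begin(trans);

        ret = example_trans_body(trans);        /* placeholder for the real work */

        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;

        return ret;
}
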
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 7f353cbdf427..aef796b5a48a 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -2,18 +2,21 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
+#include "lru.h"
#include "recovery.h"
#include "varint.h"
@@ -26,12 +29,7 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-const char * const bch2_allocator_states[] = {
-#define x(n) #n,
- ALLOC_THREAD_STATES()
-#undef x
- NULL
-};
+/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
@@ -39,16 +37,17 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x
};
-struct bkey_alloc_buf {
- struct bkey_i k;
- struct bch_alloc_v3 v;
-
-#define x(_name, _bits) + _bits / 8
- u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
+struct bkey_alloc_unpacked {
+ u64 journal_seq;
+ u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
+ bool need_discard:1;
+ bool need_inc_gen:1;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS_V2()
#undef x
-} __attribute__((packed, aligned(8)));
-
-/* Persistent alloc info: */
+};
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
const void **p, unsigned field)
@@ -170,6 +169,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
out->gen = a.v->gen;
out->oldest_gen = a.v->oldest_gen;
out->data_type = a.v->data_type;
+ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
out->journal_seq = le64_to_cpu(a.v->journal_seq);
#define x(_name, _bits) \
@@ -191,53 +192,9 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
return 0;
}
-static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
- struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- u8 *out = a->v.data;
- u8 *end = (void *) &dst[1];
- u8 *last_nonzero_field = out;
- unsigned bytes;
-
- a->k.p = POS(src.dev, src.bucket);
- a->v.gen = src.gen;
- a->v.oldest_gen = src.oldest_gen;
- a->v.data_type = src.data_type;
- a->v.journal_seq = cpu_to_le64(src.journal_seq);
-
-#define x(_name, _bits) \
- nr_fields++; \
- \
- if (src._name) { \
- out += bch2_varint_encode_fast(out, src._name); \
- \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- } else { \
- *out++ = 0; \
- }
-
- BCH_ALLOC_FIELDS_V2()
-#undef x
- BUG_ON(out > end);
-
- out = last_nonzero_field;
- a->v.nr_fields = last_nonzero_fieldnr;
-
- bytes = (u8 *) out - (u8 *) &a->v;
- set_bkey_val_bytes(&a->k, bytes);
- memset_u64s_tail(&a->v, 0, bytes);
-}
-
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
-{
- struct bkey_alloc_unpacked ret = {
- .dev = k.k->p.inode,
- .bucket = k.k->p.offset,
- .gen = 0,
- };
+ struct bkey_alloc_unpacked ret = { .gen = 0 };
switch (k.k->type) {
case KEY_TYPE_alloc:
@@ -254,26 +211,6 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
return ret;
}
-static void bch2_alloc_pack(struct bch_fs *c,
- struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
-{
- bch2_alloc_pack_v3(dst, src);
-}
-
-int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_alloc_unpacked *u, unsigned trigger_flags)
-{
- struct bkey_alloc_buf *a;
-
- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- bch2_alloc_pack(trans->c, a, *u);
- return bch2_trans_update(trans, iter, &a->k, trigger_flags);
-}
-
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
@@ -285,60 +222,341 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- if (k.k->p.inode >= c->sb.nr_devices ||
- !c->devs[k.k->p.inode])
- return "invalid device";
-
/* allow for unknown fields */
- if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v))
- return "incorrect value size";
+ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
+ prt_printf(err, "incorrect value size (%zu < %u)",
+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
-const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_alloc_unpacked u;
- if (k.k->p.inode >= c->sb.nr_devices ||
- !c->devs[k.k->p.inode])
- return "invalid device";
-
- if (bch2_alloc_unpack_v2(&u, k))
- return "unpack error";
+ if (bch2_alloc_unpack_v2(&u, k)) {
+ prt_printf(err, "unpack error");
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
-const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_alloc_unpacked u;
- if (k.k->p.inode >= c->sb.nr_devices ||
- !c->devs[k.k->p.inode])
- return "invalid device";
+ if (bch2_alloc_unpack_v3(&u, k)) {
+ prt_printf(err, "unpack error");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
+}
+
+int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
+{
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ int rw = flags & WRITE;
+
+ if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
+ prt_printf(err, "bad val size (%lu != %u)",
+ bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
+ prt_printf(err, "invalid backpointers_start");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (rw == WRITE &&
+ !(flags & BKEY_INVALID_FROM_JOURNAL) &&
+ test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ unsigned i, bp_len = 0;
- if (bch2_alloc_unpack_v3(&u, k))
- return "unpack error";
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
+ bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;
- return NULL;
+ if (bp_len > a.v->dirty_sectors) {
+ prt_printf(err, "too many backpointers");
+ return -BCH_ERR_invalid_bkey;
+ }
+ }
+
+ if (rw == WRITE) {
+ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
+ prt_printf(err, "invalid data type (got %u should be %u)",
+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ switch (a.v->data_type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ if (a.v->dirty_sectors ||
+ a.v->cached_sectors ||
+ a.v->stripe) {
+ prt_printf(err, "empty data type free but have data");
+ return -BCH_ERR_invalid_bkey;
+ }
+ break;
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ if (!a.v->dirty_sectors) {
+ prt_printf(err, "data_type %s but dirty_sectors==0",
+ bch2_data_types[a.v->data_type]);
+ return -BCH_ERR_invalid_bkey;
+ }
+ break;
+ case BCH_DATA_cached:
+ if (!a.v->cached_sectors ||
+ a.v->dirty_sectors ||
+ a.v->stripe) {
+ prt_printf(err, "data type inconsistency");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (!a.v->io_time[READ] &&
+ test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) {
+ prt_printf(err, "cached bucket with read_time == 0");
+ return -BCH_ERR_invalid_bkey;
+ }
+ break;
+ case BCH_DATA_stripe:
+ if (!a.v->stripe) {
+ prt_printf(err, "data_type %s but stripe==0",
+ bch2_data_types[a.v->data_type]);
+ return -BCH_ERR_invalid_bkey;
+ }
+ break;
+ }
+ }
+
+ return 0;
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+static inline u64 swab40(u64 x)
{
- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
- pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
- u.gen, u.oldest_gen, bch2_data_types[u.data_type],
- u.journal_seq);
-#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name);
- BCH_ALLOC_FIELDS_V2()
-#undef x
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+ struct bch_backpointer *bp, *bps;
+
+ a->journal_seq = swab64(a->journal_seq);
+ a->flags = swab32(a->flags);
+ a->dirty_sectors = swab32(a->dirty_sectors);
+ a->cached_sectors = swab32(a->cached_sectors);
+ a->io_time[0] = swab64(a->io_time[0]);
+ a->io_time[1] = swab64(a->io_time[1]);
+ a->stripe = swab32(a->stripe);
+ a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+
+ bps = alloc_v4_backpointers(a);
+ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+ bp->bucket_offset = swab40(bp->bucket_offset);
+ bp->bucket_len = swab32(bp->bucket_len);
+ bch2_bpos_swab(&bp->pos);
+ }
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+ unsigned i;
+
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "gen %u oldest_gen %u data_type %s",
+ a->gen, a->oldest_gen,
+ a->data_type < BCH_DATA_NR
+ ? bch2_data_types[a->data_type]
+ : "(invalid data type)");
+ prt_newline(out);
+ prt_printf(out, "journal_seq %llu", a->journal_seq);
+ prt_newline(out);
+ prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_newline(out);
+ prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_newline(out);
+ prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
+ prt_newline(out);
+ prt_printf(out, "cached_sectors %u", a->cached_sectors);
+ prt_newline(out);
+ prt_printf(out, "stripe %u", a->stripe);
+ prt_newline(out);
+ prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
+ prt_newline(out);
+ prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
+ prt_newline(out);
+ prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
+ prt_newline(out);
+ prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
+ prt_newline(out);
+ prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_newline(out);
+
+ if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
+ struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
+ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
+
+ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
+ printbuf_indent_add(out, 2);
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
+ prt_newline(out);
+ bch2_backpointer_to_text(out, &bps[i]);
+ }
+
+ printbuf_indent_sub(out, 2);
+ }
+
+ printbuf_indent_sub(out, 2);
+}
+
+void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
+{
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ *out = *bkey_s_c_to_alloc_v4(k).v;
+
+ src = alloc_v4_backpointers(out);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(out);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
+ } else {
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ *out = (struct bch_alloc_v4) {
+ .journal_seq = u.journal_seq,
+ .flags = u.need_discard,
+ .gen = u.gen,
+ .oldest_gen = u.oldest_gen,
+ .data_type = u.data_type,
+ .stripe_redundancy = u.stripe_redundancy,
+ .dirty_sectors = u.dirty_sectors,
+ .cached_sectors = u.cached_sectors,
+ .io_time[READ] = u.read_time,
+ .io_time[WRITE] = u.write_time,
+ .stripe = u.stripe,
+ };
+
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ }
+}
+
+static noinline struct bkey_i_alloc_v4 *
+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_i_alloc_v4 *ret;
+
+ ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4));
+ if (IS_ERR(ret))
+ return ret;
+
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ bkey_reassemble(&ret->k_i, k);
+
+ src = alloc_v4_backpointers(&ret->v);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(&ret->v);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
+ set_alloc_v4_u64s(ret);
+ } else {
+ bkey_alloc_v4_init(&ret->k_i);
+ ret->k.p = k.k->p;
+ bch2_alloc_to_v4(k, &ret->v);
+ }
+ return ret;
+}
+
+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v4 a;
+
+ if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
+ ((a = bkey_s_c_to_alloc_v4(k), true) &&
+ BCH_ALLOC_V4_BACKPOINTERS_START(a.v) == BCH_ALLOC_V4_U64s &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) {
+ /*
+ * Reserve space for one more backpointer here:
+ * Not sketchy at doing it this way, nope...
+ */
+ struct bkey_i_alloc_v4 *ret =
+ bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer));
+ if (!IS_ERR(ret))
+ bkey_reassemble(&ret->k_i, k);
+ return ret;
+ }
+
+ return __bch2_alloc_to_v4_mut(trans, k);
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return bch2_alloc_to_v4_mut_inlined(trans, k);
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
+{
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ int ret;
+
+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (unlikely(ret))
+ goto err;
+
+ a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (unlikely(ret))
+ goto err;
+ return a;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
}
int bch2_alloc_read(struct bch_fs *c)
@@ -346,624 +564,1500 @@ int bch2_alloc_read(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
+ struct bch_alloc_v4 a;
struct bch_dev *ca;
- struct bucket *g;
- struct bkey_alloc_unpacked u;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- down_read(&c->gc_lock);
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
- if (!bkey_is_alloc(k.k))
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
continue;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = bucket(ca, k.k->p.offset);
- u = bch2_alloc_unpack(k);
-
- *bucket_gen(ca, k.k->p.offset) = u.gen;
- g->_mark.gen = u.gen;
- g->_mark.data_type = u.data_type;
- g->_mark.dirty_sectors = u.dirty_sectors;
- g->_mark.cached_sectors = u.cached_sectors;
- g->_mark.stripe = u.stripe != 0;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
- g->gen_valid = 1;
+
+ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
}
bch2_trans_iter_exit(&trans, &iter);
- up_read(&c->gc_lock);
bch2_trans_exit(&trans);
- if (ret) {
- bch_err(c, "error reading alloc info: %i", ret);
- return ret;
+ if (ret)
+ bch_err(c, "error reading alloc info: %s", bch2_err_str(ret));
+
+ return ret;
+}
+
+static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
+{
+ *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
+
+ pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
+ return pos;
+}
+
+static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
+{
+ pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
+ pos.offset += offset;
+ return pos;
+}
+
+static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
+{
+ return k.k->type == KEY_TYPE_bucket_gens
+ ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
+ : 0;
+}
+
+int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
+{
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
+ prt_printf(err, "bad val size (%lu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+ return -BCH_ERR_invalid_bkey;
}
return 0;
}
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
+void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- struct bch_fs *c = trans->c;
+ struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
+ if (i)
+ prt_char(out, ' ');
+ prt_printf(out, "%u", g.v->gens[i]);
+ }
+}
+
+int bch2_bucket_gens_init(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_alloc_unpacked old_u, new_u;
+ struct bch_alloc_v4 a;
+ struct bkey_i_bucket_gens g;
+ bool have_bucket_gens_key = false;
+ unsigned offset;
+ struct bpos pos;
+ u8 gen;
int ret;
-retry:
- bch2_trans_begin(trans);
- ret = bch2_btree_key_cache_flush(trans,
- BTREE_ID_alloc, iter->pos);
- if (ret)
- goto err;
+ bch2_trans_init(&trans, c, 0, 0);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
+ continue;
- old_u = bch2_alloc_unpack(k);
- new_u = alloc_mem_to_key(c, iter);
+ gen = bch2_alloc_to_v4(k, &a)->gen;
+ pos = alloc_gens_pos(iter.pos, &offset);
- if (!bkey_alloc_unpacked_cmp(old_u, new_u))
- return 0;
+ if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
+ ret = commit_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+ if (ret)
+ break;
+ have_bucket_gens_key = false;
+ }
+
+ if (!have_bucket_gens_key) {
+ bkey_bucket_gens_init(&g.k_i);
+ g.k.p = pos;
+ have_bucket_gens_key = true;
+ }
+
+ g.v.gens[offset] = gen;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (have_bucket_gens_key && !ret)
+ ret = commit_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+
+ bch2_trans_exit(&trans);
+
+ if (ret)
+ bch_err(c, "%s: error %s", __func__, bch2_err_str(ret));
- ret = bch2_alloc_write(trans, iter, &new_u,
- BTREE_TRIGGER_NORUN) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
-err:
- if (ret == -EINTR)
- goto retry;
return ret;
}
-int bch2_alloc_write_all(struct bch_fs *c, unsigned flags)
+int bch2_bucket_gens_read(struct bch_fs *c)
{
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_bucket_gens *g;
struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ u64 b;
+ int ret;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ bch2_trans_init(&trans, c, 0, 0);
- for_each_member_device(ca, c, i) {
- bch2_btree_iter_set_pos(&iter,
- POS(ca->dev_idx, ca->mi.first_bucket));
+ for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
- while (iter.pos.offset < ca->mi.nbuckets) {
- ret = bch2_alloc_write_key(&trans, &iter, flags);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
- bch2_btree_iter_advance(&iter);
- }
+ if (k.k->type != KEY_TYPE_bucket_gens)
+ continue;
+
+ g = bkey_s_c_to_bucket_gens(k).v;
+
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_exists2(c, k.k->p.inode))
+ continue;
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ for (b = max_t(u64, ca->mi.first_bucket, start);
+ b < min_t(u64, ca->mi.nbuckets, end);
+ b++)
+ *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
}
-err:
bch2_trans_iter_exit(&trans, &iter);
+
bch2_trans_exit(&trans);
+
+ if (ret)
+ bch_err(c, "error reading alloc info: %s", bch2_err_str(ret));
+
return ret;
}
-/* Bucket IO clocks: */
+/* Free space/discard btree: */
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
+static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ const struct bch_alloc_v4 *a,
+ bool set)
{
struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_alloc_unpacked u;
- u64 *time, now;
- int ret = 0;
+ struct bkey_s_c old;
+ struct bkey_i *k;
+ enum btree_id btree;
+ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ struct printbuf buf = PRINTBUF;
+ int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
- BTREE_ITER_CACHED|
+ if (a->data_type != BCH_DATA_free &&
+ a->data_type != BCH_DATA_need_discard)
+ return 0;
+
+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.type = new_type;
+
+ switch (a->data_type) {
+ case BCH_DATA_free:
+ btree = BTREE_ID_freespace;
+ k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
+ bch2_key_resize(&k->k, 1);
+ break;
+ case BCH_DATA_need_discard:
+ btree = BTREE_ID_need_discard;
+ k->k.p = alloc_k.k->p;
+ break;
+ default:
+ return 0;
+ }
+
+ bch2_trans_iter_init(trans, &iter, btree,
+ bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
+ old = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(old);
+ if (ret)
+ goto err;
+
+ if (ca->mi.freespace_initialized &&
+ test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags) &&
+ bch2_trans_inconsistent_on(old.k->type != old_type, trans,
+ "incorrect key when %s %s btree (got %s should be %s)\n"
+ " for %s",
+ set ? "setting" : "clearing",
+ bch2_btree_ids[btree],
+ bch2_bkey_types[old.k->type],
+ bch2_bkey_types[old_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
+ struct bpos bucket, u8 gen)
+{
+ struct btree_iter iter;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(bucket, &offset);
+ struct bkey_i_bucket_gens *g;
+ struct bkey_s_c k;
+ int ret;
+
+ g = bch2_trans_kmalloc(trans, sizeof(*g));
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_bucket_gens, pos,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- goto out;
-
- u = bch2_alloc_unpack(k);
+ goto err;
- time = rw == READ ? &u.read_time : &u.write_time;
- now = atomic64_read(&c->io_clock[rw].now);
- if (*time == now)
- goto out;
+ if (k.k->type != KEY_TYPE_bucket_gens) {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = iter.pos;
+ } else {
+ bkey_reassemble(&g->k_i, k);
+ }
- *time = now;
+ g->v.gens[offset] = gen;
- ret = bch2_alloc_write(trans, &iter, &u, 0) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
-out:
+ ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-/* Background allocator thread: */
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 old_a_convert, *new_a;
+ const struct bch_alloc_v4 *old_a;
+ u64 old_lru, new_lru;
+ int ret = 0;
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
+ /*
+ * Deletion only happens in the device removal path, with
+ * BTREE_TRIGGER_NORUN:
+ */
+ BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
- struct bucket_mark m)
-{
- u8 gc_gen;
+ old_a = bch2_alloc_to_v4(old, &old_a_convert);
+ new_a = &bkey_i_to_alloc_v4(new)->v;
- if (!is_available_bucket(m))
- return false;
+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
- if (m.owned_by_allocator)
- return false;
+ if (new_a->dirty_sectors > old_a->dirty_sectors ||
+ new_a->cached_sectors > old_a->cached_sectors) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+ }
- if (ca->buckets_nouse &&
- test_bit(b, ca->buckets_nouse))
- return false;
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
- if (ca->new_fs_bucket_idx) {
- /*
- * Device or filesystem is still being initialized, and we
- * haven't fully marked superblocks & journal:
- */
- if (is_superblock_bucket(ca, b))
- return false;
+ if (old_a->data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
+ if (ret)
+ return ret;
+ }
+
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- if (b < ca->new_fs_bucket_idx)
- return false;
+ old_lru = alloc_lru_idx_read(*old_a);
+ new_lru = alloc_lru_idx_read(*new_a);
+
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new->k.p.inode,
+ bucket_to_u64(new->k.p),
+ old_lru, new_lru);
+ if (ret)
+ return ret;
}
- gc_gen = bucket_gc_gen(bucket(ca, b));
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+ bch_dev_bkey_exists(c, new->k.p.inode));
+
+ if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ ret = bch2_lru_change(trans,
+ BCH_LRU_FRAGMENTATION_START,
+ bucket_to_u64(new->k.p),
+ old_a->fragmentation_lru, new_a->fragmentation_lru);
+ if (ret)
+ return ret;
+ }
- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
+ if (old_a->gen != new_a->gen) {
+ ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
+ if (ret)
+ return ret;
+ }
- return gc_gen < BUCKET_GC_GEN_MAX;
+ return 0;
}
/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
+ * extents style btrees, but works on non-extents btrees:
*/
-
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
- u64 now, u64 last_seq_ondisk)
+struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
{
- unsigned used = bucket_sectors_used(m);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- if (used) {
- /*
- * Prefer to keep buckets that have been read more recently, and
- * buckets that have more data in them:
- */
- u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
+ if (bkey_err(k))
+ return k;
- return -last_read_scaled;
+ if (k.k->type) {
+ return k;
} else {
+ struct btree_iter iter2;
+ struct bpos next;
+
+ bch2_trans_copy_iter(&iter2, iter);
+
+ if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
+ end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
+
+ end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
+
/*
- * Prefer to use buckets with smaller gc_gen so that we don't
- * have to walk the btree and recalculate oldest_gen - but shift
- * off the low bits so that buckets will still have equal sort
- * keys when there's only a small difference, so that we can
- * keep sequential buckets together:
+ * btree node min/max is a closed interval, upto takes a half
+ * open interval:
*/
- return bucket_gc_gen(g) >> 4;
+ k = bch2_btree_iter_peek_upto(&iter2, end);
+ next = iter2.pos;
+ bch2_trans_iter_exit(iter->trans, &iter2);
+
+ BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
+
+ if (bkey_err(k))
+ return k;
+
+ bkey_init(hole);
+ hole->p = iter->pos;
+
+ bch2_key_resize(hole, next.offset - iter->pos.offset);
+ return (struct bkey_s_c) { hole, NULL };
}
}
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
+static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
{
- return cmp_int(l.key, r.key) ?:
- cmp_int(r.nr, l.nr) ?:
- cmp_int(l.bucket, r.bucket);
-}
+ struct bch_dev *ca;
+ unsigned iter;
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
- const struct alloc_heap_entry *l = _l, *r = _r;
+ if (bch2_dev_bucket_exists(c, *bucket))
+ return true;
- return cmp_int(l->bucket, r->bucket);
-}
+ if (bch2_dev_exists2(c, bucket->inode)) {
+ ca = bch_dev_bkey_exists(c, bucket->inode);
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- u64 now, last_seq_ondisk;
- size_t b, i, nr = 0;
+ if (bucket->offset < ca->mi.first_bucket) {
+ bucket->offset = ca->mi.first_bucket;
+ return true;
+ }
- down_read(&ca->bucket_lock);
+ bucket->inode++;
+ bucket->offset = 0;
+ }
- buckets = bucket_array(ca);
- ca->alloc_heap.used = 0;
- now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.flushed_seq_ondisk;
+ rcu_read_lock();
+ iter = bucket->inode;
+ ca = __bch2_next_dev(c, &iter, NULL);
+ if (ca)
+ *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
+ rcu_read_unlock();
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket *g = &buckets->b[b];
- struct bucket_mark m = READ_ONCE(g->mark);
- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+ return ca != NULL;
+}
+
+struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
+{
+ struct bch_fs *c = iter->trans->c;
+ struct bkey_s_c k;
+again:
+ k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+ if (bkey_err(k))
+ return k;
- cond_resched();
+ if (!k.k->type) {
+ struct bpos bucket = bkey_start_pos(k.k);
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ if (!next_bucket(c, &bucket))
+ return bkey_s_c_null;
- if (!m.data_type &&
- bch2_bucket_needs_journal_commit(c, last_seq_ondisk,
- ca->dev_idx, b)) {
- ca->buckets_waiting_on_journal++;
- continue;
+ bch2_btree_iter_set_pos(iter, bucket);
+ goto again;
}
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
+ if (!bch2_dev_bucket_exists(c, k.k->p)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+ bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
}
}
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
+ return k;
+}
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
+static int bch2_check_alloc_key(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ struct btree_iter *alloc_iter,
+ struct btree_iter *discard_iter,
+ struct btree_iter *freespace_iter,
+ struct btree_iter *bucket_gens_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ unsigned discard_key_type, freespace_key_type;
+ unsigned gens_offset;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
- nr -= ca->alloc_heap.data[0].nr;
- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+ "alloc key for invalid device:bucket %llu:%llu",
+ alloc_k.k->p.inode, alloc_k.k->p.offset))
+ return bch2_btree_delete_at(trans, alloc_iter, 0);
+
+ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+ if (!ca->mi.freespace_initialized)
+ return 0;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
+ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+ k = bch2_btree_iter_peek_slot(discard_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != discard_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = discard_key_type;
+ update->k.p = discard_iter->pos;
+
+ ret = bch2_trans_update(trans, discard_iter, update, 0);
+ if (ret)
+ goto err;
}
- up_read(&ca->bucket_lock);
-}
+ freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
+ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- size_t i, nr = 0;
+ if (k.k->type != freespace_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = freespace_key_type;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k, 1);
+
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
+ }
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
- ca->buckets_waiting_on_journal = 0;
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- find_reclaimable_buckets_lru(c, ca);
+ if (a->gen != alloc_gen(k, gens_offset) &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i_bucket_gens *g =
+ bch2_trans_kmalloc(trans, sizeof(*g));
+
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ goto err;
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ bkey_reassemble(&g->k_i, k);
+ } else {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
+ }
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
+ g->v.gens[gens_offset] = a->gen;
- return nr;
+ ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
}
-static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- struct bkey_alloc_unpacked *u)
+static int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *freespace_iter)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
+ struct bch_dev *ca;
struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
+ ca = bch_dev_bkey_exists(c, start.inode);
+ if (!ca->mi.freespace_initialized)
+ return 0;
- k = bch2_btree_iter_peek_slot(&iter);
+ bch2_btree_iter_set_pos(freespace_iter, start);
+
+ k = bch2_btree_iter_peek_slot(freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
- *u = bch2_alloc_unpack(k);
- u->gen++;
- u->data_type = 0;
- u->dirty_sectors = 0;
- u->cached_sectors = 0;
- u->read_time = atomic64_read(&c->io_clock[READ].now);
- u->write_time = atomic64_read(&c->io_clock[WRITE].now);
+ *end = bkey_min(k.k->p, *end);
+
+ if (k.k->type != KEY_TYPE_set &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = KEY_TYPE_set;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k,
+ min_t(u64, U32_MAX, end->offset -
+ freespace_iter->pos.offset));
- ret = bch2_alloc_write(trans, &iter, u,
- BTREE_TRIGGER_BUCKET_INVALIDATE);
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
+ }
err:
- bch2_trans_iter_exit(trans, &iter);
+fsck_err:
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, unsigned flags)
+static int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *bucket_gens_iter)
{
- struct bkey_alloc_unpacked u;
- size_t b;
- u64 commit_seq = 0;
- int ret = 0;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ unsigned i, gens_offset, gens_end_offset;
+ int ret;
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
- return 1;
+ if (c->sb.version < bcachefs_metadata_version_bucket_gens &&
+ !c->opts.version_upgrade)
+ return 0;
- BUG_ON(!ca->alloc_heap.used ||
- !ca->alloc_heap.data[0].nr);
- b = ca->alloc_heap.data[0].bucket;
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
- /* first, put on free_inc and mark as owned by allocator: */
- percpu_down_read(&c->mark_lock);
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- bch2_mark_alloc_bucket(c, ca, b, true);
+ if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
+ alloc_gens_pos(*end, &gens_end_offset)))
+ gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
- spin_lock(&c->freelist_lock);
- verify_not_on_freelist(c, ca, b);
- BUG_ON(!fifo_push(&ca->free_inc, b));
- spin_unlock(&c->freelist_lock);
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ struct bkey_i_bucket_gens g;
+ bool need_update = false;
- percpu_up_read(&c->mark_lock);
+ bkey_reassemble(&g.k_i, k);
- ret = bch2_trans_do(c, NULL, &commit_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- flags,
- bucket_invalidate_btree(&trans, ca, b, &u));
+ for (i = gens_offset; i < gens_end_offset; i++) {
+ if (fsck_err_on(g.v.gens[i], c,
+ "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
+ bucket_gens_pos_to_alloc(k.k->p, i).inode,
+ bucket_gens_pos_to_alloc(k.k->p, i).offset,
+ g.v.gens[i])) {
+ g.v.gens[i] = 0;
+ need_update = true;
+ }
+ }
- if (!ret) {
- /* remove from alloc_heap: */
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+ if (need_update) {
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g));
- top->bucket++;
- top->nr--;
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
- if (!top->nr)
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+ memcpy(k, &g, sizeof(g));
- /*
- * If we invalidating cached data then we need to wait on the
- * journal commit:
- */
- if (u.data_type)
- *journal_seq = max(*journal_seq, commit_seq);
+ ret = bch2_trans_update(trans, bucket_gens_iter, k, 0);
+ if (ret)
+ goto err;
+ }
+ }
- /*
- * We already waiting on u.alloc_seq when we filtered out
- * buckets that need journal commit:
- */
- BUG_ON(*journal_seq > u.journal_seq);
- } else {
- size_t b2;
+ *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_discard_freespace_key(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter;
+ struct bkey_s_c alloc_k;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ u64 genbits;
+ struct bpos pos;
+ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
+ ? BCH_DATA_need_discard
+ : BCH_DATA_free;
+ struct printbuf buf = PRINTBUF;
+ int ret;
- /* remove from free_inc: */
- percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
+ pos = iter->pos;
+ pos.offset &= ~(~0ULL << 56);
+ genbits = iter->pos.offset & (~0ULL << 56);
- bch2_mark_alloc_bucket(c, ca, b, false);
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
- BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
- BUG_ON(b != b2);
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
+ goto delete;
- spin_unlock(&c->freelist_lock);
- percpu_up_read(&c->mark_lock);
- }
+ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto err;
- return ret < 0 ? ret : 0;
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (fsck_err_on(a->data_type != state ||
+ (state == BCH_DATA_free &&
+ genbits != alloc_freespace_genbits(*a)), c,
+ "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ bch2_btree_ids[iter->btree_id],
+ a->data_type == state,
+ genbits >> 56, alloc_freespace_genbits(*a) >> 56))
+ goto delete;
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+delete:
+ ret = bch2_btree_delete_extent_at(trans, iter,
+ iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0);
+ goto out;
}
/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
+ * We've already checked that generation numbers in the bucket_gens btree are
+ * valid for buckets that exist; this just checks for keys for nonexistent
+ * buckets.
*/
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
+static int bch2_check_bucket_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- u64 journal_seq = 0;
+ struct bch_fs *c = trans->c;
+ struct bkey_i_bucket_gens g;
+ struct bch_dev *ca;
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+ u64 b;
+ bool need_update = false;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (!ret &&
- !fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used) {
- if (kthread_should_stop()) {
- ret = 1;
+ BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
+ bkey_reassemble(&g.k_i, k);
+
+ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+ "bucket_gens key for invalid device:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ if (fsck_err_on(end <= ca->mi.first_bucket ||
+ start >= ca->mi.nbuckets, c,
+ "bucket_gens key for invalid buckets:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ for (b = start; b < ca->mi.first_bucket; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ for (b = ca->mi.nbuckets; b < end; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ if (need_update) {
+ struct bkey_i *k;
+
+ k = bch2_trans_kmalloc(trans, sizeof(g));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto out;
+
+ memcpy(k, &g, sizeof(g));
+ ret = bch2_trans_update(trans, iter, k, 0);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_alloc_info(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+ struct bkey hole;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH);
+
+ while (1) {
+ struct bpos next;
+
+ bch2_trans_begin(&trans);
+
+ k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
+ ret = bkey_err(k);
+ if (ret)
+ goto bkey_err;
+
+ if (!k.k)
break;
+
+ if (k.k->type) {
+ next = bpos_nosnap_successor(k.k->p);
+
+ ret = bch2_check_alloc_key(&trans,
+ k, &iter,
+ &discard_iter,
+ &freespace_iter,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
+ } else {
+ next = k.k->p;
+
+ ret = bch2_check_alloc_hole_freespace(&trans,
+ bkey_start_pos(k.k),
+ &next,
+ &freespace_iter) ?:
+ bch2_check_alloc_hole_bucket_gens(&trans,
+ bkey_start_pos(k.k),
+ &next,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
}
- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
- (!fifo_empty(&ca->free_inc)
- ? BTREE_INSERT_NOWAIT : 0));
- /*
- * We only want to batch up invalidates when they're going to
- * require flushing the journal:
- */
- if (!journal_seq)
+ ret = bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_pos(&iter, next);
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
break;
}
+ bch2_trans_iter_exit(&trans, &bucket_gens_iter);
+ bch2_trans_iter_exit(&trans, &freespace_iter);
+ bch2_trans_iter_exit(&trans, &discard_iter);
+ bch2_trans_iter_exit(&trans, &iter);
- /* If we used NOWAIT, don't return the error: */
- if (!fifo_empty(&ca->free_inc))
- ret = 0;
if (ret < 0)
- bch_err(ca, "error invalidating buckets: %i", ret);
+ goto err;
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_discard_freespace_key(&trans, &iter)) ?:
+ for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_discard_freespace_key(&trans, &iter)) ?:
+ for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_bucket_gens_key(&trans, &iter, k));
+err:
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
+}
+
+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
+ struct btree_iter *alloc_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter lru_iter;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct bkey_s_c alloc_k, k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ alloc_k = bch2_btree_iter_peek(alloc_iter);
+ if (!alloc_k.k)
+ return 0;
+
+ ret = bkey_err(alloc_k);
if (ret)
return ret;
- if (journal_seq)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- return ret;
- }
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
- return 0;
+ if (a->data_type != BCH_DATA_cached)
+ return 0;
+
+ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+ lru_pos(alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ a->io_time[READ]), 0);
+ k = bch2_btree_iter_peek_slot(&lru_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(!a->io_time[READ], c,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
+ fsck_err_on(k.k->type != KEY_TYPE_set, c,
+ "missing lru entry\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ u64 read_time = a->io_time[READ] ?:
+ atomic64_read(&c->io_clock[READ].now);
+
+ ret = bch2_lru_set(trans,
+ alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ read_time);
+ if (ret)
+ goto err;
+
+ if (a->io_time[READ] != read_time) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = read_time;
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf);
+ return ret;
}
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
+int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
- if (ca->allocator_state != new_state) {
- ca->allocator_state = new_state;
- closure_wake_up(&ca->fs->freelist_wait);
- }
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_alloc_to_lru_ref(&trans, &iter));
+
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
}
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct btree_iter *need_discard_iter,
+ struct bpos *discard_pos_done,
+ u64 *seen,
+ u64 *open,
+ u64 *need_journal_commit,
+ u64 *discarded)
{
- unsigned i;
+ struct bch_fs *c = trans->c;
+ struct bpos pos = need_discard_iter->pos;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ struct bkey_i_alloc_v4 *a;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (i == RESERVE_MOVINGGC &&
- !test_bit(BCH_FS_STARTED, &c->flags))
- continue;
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
+ return 0;
+ }
- if (fifo_push(&ca->free[i], b)) {
- fifo_pop(&ca->free_inc, b);
- ret = 1;
- break;
+ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
+ (*open)++;
+ goto out;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ pos.inode, pos.offset)) {
+ (*need_journal_commit)++;
+ goto out;
+ }
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ need_discard_iter->pos,
+ BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
+ }
+
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
+ bch2_trans_inconsistent(trans,
+ "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
+ "%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
}
+ goto out;
}
- spin_unlock(&c->freelist_lock);
- ca->allocator_state = ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
+ if (a->v.data_type != BCH_DATA_need_discard) {
+ if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
+ bch2_trans_inconsistent(trans,
+ "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ }
+
+ goto out;
+ }
+
+ if (!bkey_eq(*discard_pos_done, iter.pos) &&
+ ca->mi.discard && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+ *discard_pos_done = iter.pos;
+
+ ret = bch2_trans_relock_notrace(trans);
+ if (ret)
+ goto out;
+ }
+
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+write:
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
+
+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
+ (*discarded)++;
+out:
+ (*seen)++;
+ bch2_trans_iter_exit(trans, &iter);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
return ret;
}
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+static void bch2_do_discards_work(struct work_struct *work)
{
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
- ca->mi.bucket_size, GFP_NOFS, 0);
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct bpos discard_pos_done = POS_MAX;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ /*
+ * We're doing the commit in bch2_discard_one_bucket instead of using
+ * for_each_btree_key_commit() so that we can increment counters after
+ * successful commit:
+ */
+ ret = for_each_btree_key2(&trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
+ &seen,
+ &open,
+ &need_journal_commit,
+ &discarded));
+
+ bch2_trans_exit(&trans);
+
+ if (need_journal_commit * 2 > seen)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+
+ trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ bch2_err_str(ret));
}
-static bool allocator_thread_running(struct bch_dev *ca)
+void bch2_do_discards(struct bch_fs *c)
{
- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
- ? ALLOCATOR_running
- : ALLOCATOR_stopped;
- alloc_thread_set_state(ca, state);
- return state == ALLOCATOR_running;
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
+ !queue_work(c->write_ref_wq, &c->discard_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
+static int invalidate_one_bucket(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ s64 *nr_to_invalidate)
{
- s64 available = dev_buckets_reclaimable(ca) -
- (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
- bool ret = available > 0;
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bkey_i_alloc_v4 *a = NULL;
+ struct printbuf buf = PRINTBUF;
+ struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
+ unsigned cached_sectors;
+ int ret = 0;
+
+ if (*nr_to_invalidate <= 0)
+ return 1;
+
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ prt_str(&buf, "lru entry points to invalid bucket");
+ goto err;
+ }
+
+ if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
+ return 0;
- alloc_thread_set_state(ca, ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked);
+ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ /* We expect harmless races here due to the btree write buffer: */
+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+ goto out;
+
+ BUG_ON(a->v.data_type != BCH_DATA_cached);
+
+ if (!a->v.cached_sectors)
+ bch_err(c, "invalidating empty bucket, confused");
+
+ cached_sectors = a->v.cached_sectors;
+
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ a->v.gen++;
+ a->v.data_type = 0;
+ a->v.dirty_sectors = 0;
+ a->v.cached_sectors = 0;
+ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
+
+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
+ BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
+
+ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
+ --*nr_to_invalidate;
+out:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
return ret;
+err:
+ prt_str(&buf, "\n lru key: ");
+ bch2_bkey_val_to_text(&buf, c, lru_k);
+
+ prt_str(&buf, "\n lru entry: ");
+ bch2_lru_pos_to_text(&buf, lru_iter->pos);
+
+ prt_str(&buf, "\n alloc key: ");
+ if (!a)
+ bch2_bpos_to_text(&buf, bucket);
+ else
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+
+ bch_err(c, "%s", buf.buf);
+ if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+ bch2_inconsistent_error(c);
+ ret = -EINVAL;
+ }
+
+ goto out;
}
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
+static void bch2_do_invalidates_work(struct work_struct *work)
{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- unsigned long gc_count = c->gc_count;
- size_t nr;
+ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = bch2_btree_write_buffer_flush(&trans);
+ if (ret)
+ goto err;
+
+ for_each_member_device(ca, c, i) {
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0, 0),
+ lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
+ BTREE_ITER_INTENT, k,
+ invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate));
+
+ if (ret < 0) {
+ percpu_ref_put(&ca->ref);
+ break;
+ }
+ }
+err:
+ bch2_trans_exit(&trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+void bch2_do_invalidates(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
+ !queue_work(c->write_ref_wq, &c->invalidate_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
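
bch2_do_discards() and bch2_do_invalidates() share the same kick pattern: take a write reference, try to queue the worker, and drop the reference again if the work item was already pending, so the reference is held exactly as long as a worker run is outstanding. A rough standalone sketch of that idiom (the refcount and queue below are stand-ins, not bcachefs or kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct write_ref { atomic_int count; };

/* take a reference only if the count is still nonzero */
static bool write_ref_tryget(struct write_ref *r)
{
	int v = atomic_load(&r->count);

	while (v > 0)
		if (atomic_compare_exchange_weak(&r->count, &v, v + 1))
			return true;
	return false;
}

static void write_ref_put(struct write_ref *r)
{
	atomic_fetch_sub(&r->count, 1);
}

static bool work_pending;	/* stand-in for a single-slot workqueue */

static bool queue_worker(void)
{
	if (work_pending)
		return false;	/* already queued, nothing to do */
	work_pending = true;
	return true;
}

/* Kick the background worker; a successfully queued run owns the reference. */
static void kick_worker(struct write_ref *r)
{
	if (write_ref_tryget(r) && !queue_worker())
		write_ref_put(r);
}

int main(void)
{
	struct write_ref r;

	atomic_init(&r.count, 1);
	kick_worker(&r);	/* queues the worker, count goes 1 -> 2 */
	kick_worker(&r);	/* already queued, the extra ref is dropped */
	printf("count %d, pending %d\n", atomic_load(&r.count), (int)work_pending);
	return 0;
}
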
+
+static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
+ unsigned long *last_updated)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey hole;
+ struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets);
+ struct bch_member *m;
int ret;
- set_freezable();
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_PREFETCH);
+ /*
+ * Scan the alloc btree for every bucket on @ca, and add buckets to the
+ * freespace/need_discard/need_gc_gens btrees as needed:
+ */
while (1) {
- ret = kthread_wait_freezable(allocator_thread_running(ca));
+ if (*last_updated + HZ * 10 < jiffies) {
+ bch_info(ca, "%s: currently at %llu/%llu",
+ __func__, iter.pos.offset, ca->mi.nbuckets);
+ *last_updated = jiffies;
+ }
+
+ bch2_trans_begin(&trans);
+
+ if (bkey_ge(iter.pos, end)) {
+ ret = 0;
+ break;
+ }
+
+ k = bch2_get_key_or_hole(&iter, end, &hole);
+ ret = bkey_err(k);
if (ret)
- goto stop;
+ goto bkey_err;
+
+ if (k.k->type) {
+ /*
+ * We process live keys in the alloc btree one at a
+ * time:
+ */
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+ ret = bch2_bucket_do_index(&trans, k, a, true) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto bkey_err;
- while (!ca->alloc_heap.used) {
- cond_resched();
+ bch2_btree_iter_advance(&iter);
+ } else {
+ struct bkey_i *freespace;
- ret = kthread_wait_freezable(buckets_available(ca, gc_count));
+ freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace));
+ ret = PTR_ERR_OR_ZERO(freespace);
if (ret)
- goto stop;
-
- gc_count = c->gc_count;
- nr = find_reclaimable_buckets(c, ca);
-
- if (!nr && ca->buckets_waiting_on_journal) {
- ret = bch2_journal_flush(&c->journal);
- if (ret)
- goto stop;
- } else if (nr < (ca->mi.nbuckets >> 6) &&
- ca->buckets_waiting_on_journal >= nr / 2) {
- bch2_journal_flush_async(&c->journal, NULL);
- }
+ goto bkey_err;
- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
- ca->inc_gen_really_needs_gc) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
+ bkey_init(&freespace->k);
+ freespace->k.type = KEY_TYPE_set;
+ freespace->k.p = k.k->p;
+ freespace->k.size = k.k->size;
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
- }
+ ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto bkey_err;
- ret = bch2_invalidate_buckets(c, ca);
+ bch2_btree_iter_set_pos(&iter, k.k->p);
+ }
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
if (ret)
- goto stop;
+ break;
+ }
- while (!fifo_empty(&ca->free_inc)) {
- u64 b = fifo_peek(&ca->free_inc);
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
- discard_one_bucket(c, ca, b);
+ if (ret < 0) {
+ bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
+ return ret;
+ }
- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
- if (ret)
- goto stop;
+ mutex_lock(&c->sb_lock);
+ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+ bool doing_init = false;
+ unsigned long last_updated = jiffies;
+
+ /*
+ * We can crash during the device add path, so we need to check this on
+ * every mount:
+ */
+
+ for_each_member_device(ca, c, i) {
+ if (ca->mi.freespace_initialized)
+ continue;
+
+ if (!doing_init) {
+ bch_info(c, "initializing freespace");
+ doing_init = true;
+ }
+
+ ret = bch2_dev_freespace_init(c, ca, &last_updated);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
}
}
-stop:
- alloc_thread_set_state(ca, ALLOCATOR_stopped);
- return 0;
+
+ if (doing_init) {
+ mutex_lock(&c->sb_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ bch_verbose(c, "done initializing freespace");
+ }
+
+ return ret;
+}
+
+/* Bucket IO clocks: */
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ u64 now;
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (a->v.io_time[rw] == now)
+ goto out;
+
+ a->v.io_time[rw] = now;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
/* Startup/shutdown (ro/rw): */
@@ -974,12 +2068,12 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i, j;
+ unsigned i;
lockdep_assert_held(&c->state_lock);
for_each_online_member(ca, c, i) {
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
ra_pages += bdi->ra_pages;
}
@@ -1005,8 +2099,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_NONE; j++)
- dev_reserve += ca->free[j].size;
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
@@ -1062,8 +2157,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
- BUG_ON(ca->alloc_thread);
-
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1074,40 +2167,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
*/
bch2_recalc_capacity(c);
- /* Next, close write points that point to this device... */
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_writepoint_stop(c, ca, &c->write_points[i]);
-
- bch2_writepoint_stop(c, ca, &c->copygc_write_point);
- bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
- bch2_writepoint_stop(c, ca, &c->btree_write_point);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- while (c->btree_reserve_cache_nr) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- bch2_open_buckets_put(c, &a->ob);
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- while (1) {
- struct open_bucket *ob;
-
- spin_lock(&c->freelist_lock);
- if (!ca->open_buckets_partial_nr) {
- spin_unlock(&c->freelist_lock);
- break;
- }
- ob = c->open_buckets +
- ca->open_buckets_partial[--ca->open_buckets_partial_nr];
- ob->on_partial_list = false;
- spin_unlock(&c->freelist_lock);
-
- bch2_open_bucket_put(c, ob);
- }
-
- bch2_ec_stop_dev(c, ca);
+ bch2_open_buckets_stop(c, ca, false);
/*
* Wake up threads that were blocked on allocation, so they can notice
@@ -1137,62 +2197,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
- if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch-alloc/%s", ca->name);
- if (IS_ERR(p)) {
- bch_err(ca->fs, "error creating allocator thread: %li",
- PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
+ INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 86b64177b3d0..324798396fc6 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -8,91 +8,190 @@
#include "debug.h"
#include "super.h"
-extern const char * const bch2_allocator_states[];
-
-struct bkey_alloc_unpacked {
- u64 journal_seq;
- u64 bucket;
- u8 dev;
- u8 gen;
- u8 oldest_gen;
- u8 data_type;
-#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS_V2()
-#undef x
-};
-
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
-/* returns true if not equal */
-static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
- struct bkey_alloc_unpacked r)
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- return l.gen != r.gen ||
- l.oldest_gen != r.oldest_gen ||
- l.data_type != r.data_type
-#define x(_name, ...) || l._name != r._name
- BCH_ALLOC_FIELDS_V2()
-#undef x
- ;
+ struct bch_dev *ca;
+
+ if (!bch2_dev_exists2(c, pos.inode))
+ return false;
+
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ return pos.offset >= ca->mi.first_bucket &&
+ pos.offset < ca->mi.nbuckets;
}
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
- struct bkey_alloc_unpacked *, unsigned);
+static inline u64 bucket_to_u64(struct bpos bucket)
+{
+ return (bucket.inode << 48) | bucket.offset;
+}
-int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+static inline struct bpos u64_to_bucket(u64 bucket)
+{
+ return POS(bucket >> 48, bucket & ~(~0ULL << 48));
+}
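
bucket_to_u64()/u64_to_bucket() pack a device:bucket position into one 64-bit value: the device index goes in the top 16 bits and the bucket offset in the low 48 bits, which is how alloc positions are embedded in LRU keys elsewhere in this patch. A minimal userspace sketch of the round trip (the struct and names below are illustrative, not kernel types):

#include <assert.h>
#include <stdint.h>

struct pos { uint64_t inode, offset; };	/* device index, bucket number */

static uint64_t pack_bucket(struct pos b)
{
	return (b.inode << 48) | b.offset;
}

static struct pos unpack_bucket(uint64_t v)
{
	return (struct pos) { v >> 48, v & ~(~0ULL << 48) };
}

int main(void)
{
	struct pos b = { 3, 123456 };	/* device 3, bucket 123456 */
	struct pos r = unpack_bucket(pack_bucket(b));

	assert(r.inode == b.inode && r.offset == b.offset);
	return 0;
}
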
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter)
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
{
- struct bch_dev *ca;
- struct bucket *g;
- struct bkey_alloc_unpacked ret;
-
- percpu_down_read(&c->mark_lock);
- ca = bch_dev_bkey_exists(c, iter->pos.inode);
- g = bucket(ca, iter->pos.offset);
- ret = (struct bkey_alloc_unpacked) {
- .dev = iter->pos.inode,
- .bucket = iter->pos.offset,
- .gen = g->mark.gen,
- .oldest_gen = g->oldest_gen,
- .data_type = g->mark.data_type,
- .dirty_sectors = g->mark.dirty_sectors,
- .cached_sectors = g->mark.cached_sectors,
- .read_time = g->io_time[READ],
- .write_time = g->io_time[WRITE],
- .stripe = g->stripe,
- .stripe_redundancy = g->stripe_redundancy,
- };
- percpu_up_read(&c->mark_lock);
+ return a.gen - a.oldest_gen;
+}
+
+static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
+ u32 cached_sectors,
+ u32 stripe,
+ struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (stripe)
+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
+ if (dirty_sectors)
+ return data_type;
+ if (cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
+ a.stripe, a, data_type);
+}
+
+static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
+{
+ return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
+}
+
+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
+}
+
+#define DATA_TYPES_MOVABLE \
+ ((1U << BCH_DATA_btree)| \
+ (1U << BCH_DATA_user)| \
+ (1U << BCH_DATA_stripe))
+
+static inline bool data_type_movable(enum bch_data_type type)
+{
+ return (1U << type) & DATA_TYPES_MOVABLE;
+}
+
+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
+ struct bch_dev *ca)
+{
+ if (!data_type_movable(a.data_type) ||
+ a.dirty_sectors >= ca->mi.bucket_size)
+ return 0;
+
+ return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+}
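
As a worked example of alloc_lru_idx_fragmentation() above (the numbers are made up): a movable 512-sector bucket holding 128 dirty sectors maps to 128 * 2^31 / 512 = 2^29, while a full or non-movable bucket maps to 0 and is never considered fragmented. A standalone check of that arithmetic:

#include <assert.h>
#include <stdint.h>

/* mirrors alloc_lru_idx_fragmentation() for a movable bucket */
static uint64_t frag_idx(uint64_t dirty_sectors, uint64_t bucket_size)
{
	if (dirty_sectors >= bucket_size)
		return 0;
	return dirty_sectors * (1ULL << 31) / bucket_size;
}

int main(void)
{
	assert(frag_idx(128, 512) == (1ULL << 29));	/* a quarter full */
	assert(frag_idx(512, 512) == 0);		/* full: not fragmented */
	return 0;
}
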
+
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+ return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
+
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
+{
+ pos.offset |= alloc_freespace_genbits(a);
+ return pos;
+}
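
alloc_freespace_pos() encodes two things in a freespace key's offset: the bucket number in the low 56 bits and the top four bits of the bucket's gc gen (gen minus oldest_gen) in the high byte, via alloc_freespace_genbits(). Readers split the offset back apart the same way; see the mask and shift pairs in bch2_check_discard_freespace_key() and try_alloc_bucket(). A small sketch of the encode/decode, with illustrative helper names:

#include <assert.h>
#include <stdint.h>

/* high byte carries gc_gen >> 4, the low 56 bits carry the bucket offset */
static uint64_t freespace_offset(uint64_t bucket, uint8_t gc_gen)
{
	return bucket | ((uint64_t) (gc_gen >> 4) << 56);
}

int main(void)
{
	uint64_t off = freespace_offset(123456, 0x37);

	assert((off & ~(~0ULL << 56)) == 123456);	/* bucket recovered */
	assert((off >> 56) == 0x3);			/* genbits recovered */
	return 0;
}
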
+
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0) +
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+ (sizeof(struct bch_backpointer) / sizeof(u64));
+
+ BUG_ON(ret > U8_MAX - BKEY_U64s);
+ return ret;
+}
+
+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
+{
+ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+
+void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+
+static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
+{
+ const struct bch_alloc_v4 *ret;
+
+ if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
+ goto slowpath;
+
+ ret = bkey_s_c_to_alloc_v4(k).v;
+ if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
+ goto slowpath;
return ret;
+slowpath:
+ __bch2_alloc_to_v4(k, convert);
+ return convert;
}
-#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
-const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+
+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_ops_alloc (struct bkey_ops) { \
+#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
-}
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+})
-#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
+#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
-}
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+})
-#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \
+#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
-}
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+})
+
+#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v4_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .swab = bch2_alloc_v4_swab, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+})
+
+int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
+ .key_invalid = bch2_bucket_gens_invalid, \
+ .val_to_text = bch2_bucket_gens_to_text, \
+})
+
+int bch2_bucket_gens_init(struct bch_fs *);
static inline bool bkey_is_alloc(const struct bkey *k)
{
@@ -102,44 +201,47 @@ static inline bool bkey_is_alloc(const struct bkey *k)
}
int bch2_alloc_read(struct bch_fs *);
+int bch2_bucket_gens_read(struct bch_fs *);
-static inline void bch2_wake_allocator(struct bch_dev *ca)
+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *);
+int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_do_discards(struct bch_fs *);
+
+static inline u64 should_invalidate_buckets(struct bch_dev *ca,
+ struct bch_dev_usage u)
{
- struct task_struct *p;
+ u64 want_free = ca->mi.nbuckets >> 7;
+ u64 free = max_t(s64, 0,
+ u.d[BCH_DATA_free].buckets
+ + u.d[BCH_DATA_need_discard].buckets
+ - bch2_dev_buckets_reserved(ca, RESERVE_stripe));
- rcu_read_lock();
- p = rcu_dereference(ca->alloc_thread);
- if (p)
- wake_up_process(p);
- rcu_read_unlock();
+ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
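
should_invalidate_buckets() answers how many cached buckets should be invalidated right now: the target is roughly nbuckets / 128 buckets that are free or awaiting discard (less the stripe reserve), and any shortfall, capped by the number of cached buckets that actually exist, is handed to the invalidate worker. A worked sketch of the arithmetic with made-up numbers:

#include <assert.h>
#include <stdint.h>

static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

/* mirrors should_invalidate_buckets(); all numbers here are illustrative */
static uint64_t want_to_invalidate(uint64_t nbuckets, uint64_t free,
				   uint64_t need_discard, uint64_t cached,
				   uint64_t reserved)
{
	uint64_t want_free = nbuckets >> 7;	/* target: ~0.8% of the device */
	int64_t have = (int64_t) (free + need_discard) - (int64_t) reserved;

	if (have < 0)
		have = 0;
	return clamp64((int64_t) want_free - have, 0, cached);
}

int main(void)
{
	/* 1M buckets: target 8192; 4000 free + 1000 need_discard - 100 reserved */
	assert(want_to_invalidate(1 << 20, 4000, 1000, 50000, 100) == 3292);
	return 0;
}
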
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
- size_t bucket)
+void bch2_do_invalidates(struct bch_fs *);
+
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
{
- if (bch2_expensive_debug_checks) {
- size_t iter;
- long i;
- unsigned j;
+ return (void *) ((u64 *) &a->v +
+ (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0));
+}
- for (j = 0; j < RESERVE_NR; j++)
- fifo_for_each_entry(i, &ca->free[j], iter)
- BUG_ON(i == bucket);
- fifo_for_each_entry(i, &ca->free_inc, iter)
- BUG_ON(i == bucket);
- }
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
}
+int bch2_fs_freespace_init(struct bch_fs *);
+
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
-int bch2_alloc_write_all(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 0a634125dc90..350635f3b118 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -14,19 +14,34 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_iter.h"
+#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
+#include "error.h"
#include "io.h"
+#include "journal.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <trace/events/bcachefs.h>
+const char * const bch2_alloc_reserves[] = {
+#define x(t) #t,
+ BCH_ALLOC_RESERVES()
+#undef x
+ NULL
+};
+
/*
* Open buckets represent a bucket that's currently being allocated from. They
* serve two purposes:
@@ -43,6 +58,17 @@
* reference _after_ doing the index update that makes its allocation reachable.
*/
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ ca->alloc_cursor = 0;
+ rcu_read_unlock();
+}
+
static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
open_bucket_idx_t idx = ob - c->open_buckets;
@@ -71,14 +97,13 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
if (ob->ec) {
- bch2_ec_bucket_written(c, ob);
+ ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
return;
}
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
ob->data_type = 0;
@@ -129,42 +154,17 @@ static void open_bucket_free_unused(struct bch_fs *c,
struct write_point *wp,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
- bool may_realloc = wp->data_type == BCH_DATA_user;
-
- BUG_ON(ca->open_buckets_partial_nr >
- ARRAY_SIZE(ca->open_buckets_partial));
-
- if (ca->open_buckets_partial_nr <
- ARRAY_SIZE(ca->open_buckets_partial) &&
- may_realloc) {
- spin_lock(&c->freelist_lock);
- ob->on_partial_list = true;
- ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
- ob - c->open_buckets;
- spin_unlock(&c->freelist_lock);
+ BUG_ON(c->open_buckets_partial_nr >=
+ ARRAY_SIZE(c->open_buckets_partial));
- closure_wake_up(&c->open_buckets_wait);
- closure_wake_up(&c->freelist_wait);
- } else {
- bch2_open_bucket_put(c, ob);
- }
-}
-
-static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct open_bucket *ob;
- unsigned i;
-
- rcu_read_lock();
- open_bucket_for_each(c, obs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ spin_lock(&c->freelist_lock);
+ ob->on_partial_list = true;
+ c->open_buckets_partial[c->open_buckets_partial_nr++] =
+ ob - c->open_buckets;
+ spin_unlock(&c->freelist_lock);
- BUG_ON(*bucket_gen(ca, ob->bucket) != ob->gen);
- }
- rcu_read_unlock();
-#endif
+ closure_wake_up(&c->open_buckets_wait);
+ closure_wake_up(&c->freelist_wait);
}
/* _only_ for allocating the journal on a new device: */
@@ -184,49 +184,48 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
{
switch (reserve) {
- case RESERVE_BTREE:
- case RESERVE_BTREE_MOVINGGC:
+ case RESERVE_btree:
+ case RESERVE_btree_movinggc:
return 0;
- case RESERVE_MOVINGGC:
+ case RESERVE_movinggc:
return OPEN_BUCKETS_COUNT / 4;
default:
return OPEN_BUCKETS_COUNT / 2;
}
}
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
- enum alloc_reserve reserve,
- bool may_alloc_partial,
- struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket,
+ enum alloc_reserve reserve,
+ const struct bch_alloc_v4 *a,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
{
struct open_bucket *ob;
- long b = 0;
- spin_lock(&c->freelist_lock);
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ s->skipped_nouse++;
+ return NULL;
+ }
- if (may_alloc_partial) {
- int i;
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ s->skipped_open++;
+ return NULL;
+ }
- for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
- ob = c->open_buckets + ca->open_buckets_partial[i];
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ s->skipped_need_journal_commit++;
+ return NULL;
+ }
- if (reserve <= ob->alloc_reserve) {
- array_remove_item(ca->open_buckets_partial,
- ca->open_buckets_partial_nr,
- i);
- ob->on_partial_list = false;
- ob->alloc_reserve = reserve;
- spin_unlock(&c->freelist_lock);
- return ob;
- }
- }
+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
+ s->skipped_nocow++;
+ return NULL;
}
+ spin_lock(&c->freelist_lock);
+
if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
@@ -235,46 +234,25 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
c->blocked_allocate_open_bucket = local_clock();
spin_unlock(&c->freelist_lock);
- trace_open_bucket_alloc_fail(ca, reserve);
- return ERR_PTR(-OPEN_BUCKETS_EMPTY);
+ return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
- goto out;
-
- switch (reserve) {
- case RESERVE_BTREE_MOVINGGC:
- case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
- goto out;
- break;
- default:
- break;
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ spin_unlock(&c->freelist_lock);
+ s->skipped_open++;
+ return NULL;
}
- if (cl)
- closure_wait(&c->freelist_wait, cl);
-
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
-
- spin_unlock(&c->freelist_lock);
-
- trace_bucket_alloc_fail(ca, reserve);
- return ERR_PTR(-FREELIST_EMPTY);
-out:
- verify_not_on_freelist(c, ca, b);
-
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
- ob->alloc_reserve = reserve;
ob->dev = ca->dev_idx;
- ob->gen = *bucket_gen(ca, b);
- ob->bucket = b;
+ ob->gen = a->gen;
+ ob->bucket = bucket;
spin_unlock(&ob->lock);
ca->nr_open_buckets++;
@@ -295,10 +273,322 @@ out:
}
spin_unlock(&c->freelist_lock);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum alloc_reserve reserve, u64 free_entry,
+ struct bucket_alloc_state *s,
+ struct bkey_s_c freespace_k,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct open_bucket *ob;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ u64 b = free_entry & ~(~0ULL << 56);
+ unsigned genbits = free_entry >> 56;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+ prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+ " freespace key ",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ if (a->data_type != BCH_DATA_free) {
+ if (!test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
+ ob = NULL;
+ goto err;
+ }
+
+ prt_printf(&buf, "non free bucket in freespace btree\n"
+ " freespace key ");
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
+ test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
+ prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+ " freespace key ",
+ genbits, alloc_freespace_genbits(*a) >> 56);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+
+ }
+
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ struct bch_backpointer bp;
+ struct bpos bp_pos = POS_MIN;
+
+ ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+ &bp_pos, &bp,
+ BTREE_ITER_NOPRESERVE);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ if (!bkey_eq(bp_pos, POS_MAX)) {
+ /*
+ * Bucket may have data in it - we don't call
+ * bch2_trans_inconsistent() because fsck hasn't
+ * finished yet
+ */
+ ob = NULL;
+ goto err;
+ }
+ }
+
+ ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl);
+ if (!ob)
+ iter.path->preserve = false;
+err:
+ set_btree_iter_dontneed(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ob;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+ int ret;
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+ BTREE_ITER_SLOTS, k, ret) {
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
- bch2_wake_allocator(ca);
+ if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
+ break;
+
+ if (ca->new_fs_bucket_idx &&
+ is_superblock_bucket(ca, k.k->p.offset))
+ continue;
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ if (a->data_type != BCH_DATA_free)
+ continue;
+
+ s->buckets_seen++;
+
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, a, s, cl);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_cursor > alloc_start) {
+ alloc_cursor = alloc_start;
+ goto again;
+ }
+
+ return ob;
+}
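
bch2_bucket_alloc_early() (and bch2_bucket_alloc_freelist() just below) searches with a per-device cursor and wraps around: scan from the saved cursor to the end of the device, and only if nothing is found restart once from the beginning, so repeated allocations do not rescan the same exhausted prefix. A toy version of that cursor-with-wraparound search over a plain array (not bcachefs code):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Find a free slot, starting at *cursor and wrapping once; returns -1 if none. */
static long alloc_slot(const bool *is_free, size_t nr, size_t *cursor)
{
	size_t start = *cursor < nr ? *cursor : 0;
	size_t i;

	for (i = start; i < nr; i++)
		if (is_free[i])
			goto found;
	for (i = 0; i < start; i++)	/* wrap around, but only once */
		if (is_free[i])
			goto found;
	return -1;
found:
	*cursor = i + 1;		/* resume after the hit next time */
	return (long) i;
}

int main(void)
{
	bool slot_free[6] = { false, false, true, false, true, false };
	size_t cursor = 4;

	assert(alloc_slot(slot_free, 6, &cursor) == 4);	/* starts at the cursor */
	assert(alloc_slot(slot_free, 6, &cursor) == 2);	/* wraps to the front */
	assert(cursor == 3);
	return 0;
}
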
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
+ int ret;
+
+ BUG_ON(ca->new_fs_bucket_idx);
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
+ if (k.k->p.inode != ca->dev_idx)
+ break;
+
+ for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
+ alloc_cursor < k.k->p.offset;
+ alloc_cursor++) {
+ ret = btree_trans_too_many_iters(trans);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ break;
+ }
+
+ s->buckets_seen++;
+
+ ob = try_alloc_bucket(trans, ca, reserve,
+ alloc_cursor, s, k, cl);
+ if (ob) {
+ iter.path->preserve = false;
+ break;
+ }
+ }
+
+ if (ob || ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_start > ca->mi.first_bucket) {
+ alloc_cursor = alloc_start = ca->mi.first_bucket;
+ goto again;
+ }
- trace_bucket_alloc(ca, reserve);
+ return ob;
+}
+
+/**
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ *
+ * Returns an open_bucket on success, or an ERR_PTR() on failure.
+ */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ struct closure *cl,
+ struct bch_dev_usage *usage)
+{
+ struct bch_fs *c = trans->c;
+ struct open_bucket *ob = NULL;
+ bool freespace = READ_ONCE(ca->mi.freespace_initialized);
+ u64 avail;
+ struct bucket_alloc_state s = { 0 };
+ bool waiting = false;
+again:
+ bch2_dev_usage_read_fast(ca, usage);
+ avail = dev_buckets_free(ca, *usage, reserve);
+
+ if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ bch2_do_discards(c);
+
+ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ bch2_do_gc_gens(c);
+
+ if (should_invalidate_buckets(ca, *usage))
+ bch2_do_invalidates(c);
+
+ if (!avail) {
+ if (cl && !waiting) {
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ goto again;
+ }
+
+ if (!c->blocked_allocate)
+ c->blocked_allocate = local_clock();
+
+ ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ goto err;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc:
+ ob = likely(freespace)
+ ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl)
+ : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl);
+
+ if (s.skipped_need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (!ob && freespace && !test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) {
+ freespace = false;
+ goto alloc;
+ }
+err:
+ if (!ob)
+ ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+
+ if (!IS_ERR(ob))
+ trace_and_count(c, bucket_alloc, ca,
+ bch2_alloc_reserves[reserve],
+ ob->bucket,
+ usage->d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ &s,
+ cl == NULL,
+ "");
+ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
+ trace_and_count(c, bucket_alloc_fail, ca,
+ bch2_alloc_reserves[reserve],
+ 0,
+ usage->d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ &s,
+ cl == NULL,
+ bch2_err_str(PTR_ERR(ob)));
+
+ return ob;
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum alloc_reserve reserve,
+ struct closure *cl)
+{
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
+ cl, &usage)));
return ob;
}
@@ -325,11 +615,12 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
return ret;
}
-void bch2_dev_stripe_increment(struct bch_dev *ca,
- struct dev_stripe_state *stripe)
+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
+ struct dev_stripe_state *stripe,
+ struct bch_dev_usage *usage)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca);
+ u64 free_space = dev_buckets_available(ca, RESERVE_none);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
@@ -345,12 +636,19 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
*v = *v < scale ? 0 : *v - scale;
}
-#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0)
-#define BUCKET_ALLOC_USE_DURABILITY (1 << 1)
+void bch2_dev_stripe_increment(struct bch_dev *ca,
+ struct dev_stripe_state *stripe)
+{
+ struct bch_dev_usage usage;
-static void add_new_bucket(struct bch_fs *c,
+ bch2_dev_usage_read_fast(ca, &usage);
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+}
+
+static int add_new_bucket(struct bch_fs *c,
struct open_buckets *ptrs,
struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
unsigned flags,
@@ -359,60 +657,85 @@ static void add_new_bucket(struct bch_fs *c,
unsigned durability =
bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+ BUG_ON(*nr_effective >= nr_replicas);
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
__clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY)
+ *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
? durability : 1;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
+
+ if (*nr_effective >= nr_replicas)
+ return 1;
+ if (ob->ec)
+ return 1;
+ return 0;
}
-int bch2_bucket_alloc_set(struct bch_fs *c,
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct open_buckets *ptrs,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- enum alloc_reserve reserve,
unsigned flags,
+ enum bch_data_type data_type,
+ enum alloc_reserve reserve,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ unsigned dev;
struct bch_dev *ca;
- int ret = -INSUFFICIENT_DEVICES;
+ int ret = -BCH_ERR_insufficient_devices;
unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
for (i = 0; i < devs_sorted.nr; i++) {
+ struct bch_dev_usage usage;
struct open_bucket *ob;
- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ dev = devs_sorted.devs[i];
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
if (!ca)
continue;
- if (!ca->mi.durability && *have_cache)
+ if (!ca->mi.durability && *have_cache) {
+ percpu_ref_put(&ca->ref);
continue;
+ }
+
+ ob = bch2_bucket_alloc_trans(trans, ca, reserve, cl, &usage);
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+ percpu_ref_put(&ca->ref);
- ob = bch2_bucket_alloc(c, ca, reserve,
- flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
-
- if (cl)
- return ret;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
+ break;
continue;
}
- add_new_bucket(c, ptrs, devs_may_alloc,
- nr_effective, have_cache, flags, ob);
+ ob->data_type = data_type;
- bch2_dev_stripe_increment(ca, stripe);
-
- if (*nr_effective >= nr_replicas)
- return 0;
+ if (add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob)) {
+ ret = 0;
+ break;
+ }
}
return ret;
@@ -426,26 +749,25 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
* it's to a device we don't want:
*/
-static int bucket_alloc_from_stripe(struct bch_fs *c,
+static int bucket_alloc_from_stripe(struct btree_trans *trans,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
u16 target,
- unsigned erasure_code,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
+ enum alloc_reserve reserve,
unsigned flags,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
struct open_bucket *ob;
struct bch_dev *ca;
unsigned i, ec_idx;
-
- if (!erasure_code)
- return 0;
+ int ret = 0;
if (nr_replicas < 2)
return 0;
@@ -453,11 +775,9 @@ static int bucket_alloc_from_stripe(struct bch_fs *c,
if (ec_open_bucket(c, ptrs))
return 0;
- h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1,
- wp == &c->copygc_write_point,
- cl);
+ h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, reserve, cl);
if (IS_ERR(h))
- return -PTR_ERR(h);
+ return PTR_ERR(h);
if (!h)
return 0;
@@ -479,55 +799,126 @@ got_bucket:
ob->ec_idx = ec_idx;
ob->ec = h->s;
+ ec_stripe_new_get(h->s, STRIPE_REF_io);
- add_new_bucket(c, ptrs, devs_may_alloc,
- nr_effective, have_cache, flags, ob);
- atomic_inc(&h->s->pin);
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
out_put_head:
bch2_ec_stripe_head_put(c, h);
- return 0;
+ return ret;
}
/* Sector allocator */
-static void get_buckets_from_writepoint(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- unsigned flags,
- bool need_ec)
+static bool want_bucket(struct bch_fs *c,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ bool *have_cache, bool ec,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ if (!test_bit(ob->dev, devs_may_alloc->d))
+ return false;
+
+ if (ob->data_type != wp->data_type)
+ return false;
+
+ if (!ca->mi.durability &&
+ (wp->data_type == BCH_DATA_btree || ec || *have_cache))
+ return false;
+
+ if (ec != (ob->ec != NULL))
+ return false;
+
+ return true;
+}
+
+static int bucket_alloc_set_writepoint(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ bool ec, unsigned flags)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
unsigned i;
+ int ret = 0;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
-
- if (*nr_effective < nr_replicas &&
- test_bit(ob->dev, devs_may_alloc->d) &&
- (ca->mi.durability ||
- (wp->data_type == BCH_DATA_user && !*have_cache)) &&
- (ob->ec || !need_ec)) {
- add_new_bucket(c, ptrs, devs_may_alloc,
- nr_effective, have_cache,
- flags, ob);
- } else {
+ if (!ret && want_bucket(c, wp, devs_may_alloc,
+ have_cache, ec, ob))
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+ else
ob_push(c, &ptrs_skip, ob);
- }
}
wp->ptrs = ptrs_skip;
+
+ return ret;
+}
+
+static int bucket_alloc_set_partial(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache, bool ec,
+ enum alloc_reserve reserve,
+ unsigned flags)
+{
+ int i, ret = 0;
+
+ if (!c->open_buckets_partial_nr)
+ return 0;
+
+ spin_lock(&c->freelist_lock);
+
+ if (!c->open_buckets_partial_nr)
+ goto unlock;
+
+ for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
+ struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
+
+ if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev_usage usage;
+ u64 avail;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ avail = dev_buckets_free(ca, usage, reserve);
+ if (!avail)
+ continue;
+
+ array_remove_item(c->open_buckets_partial,
+ c->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+ if (ret)
+ break;
+ }
+ }
+unlock:
+ spin_unlock(&c->freelist_lock);
+ return ret;
}
-static int open_bucket_add_buckets(struct bch_fs *c,
+static int __open_bucket_add_buckets(struct btree_trans *trans,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_list *devs_have,
u16 target,
- unsigned erasure_code,
+ bool erasure_code,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
@@ -535,11 +926,12 @@ static int open_bucket_add_buckets(struct bch_fs *c,
unsigned flags,
struct closure *_cl)
{
+ struct bch_fs *c = trans->c;
struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
- int ret;
unsigned i;
+ int ret;
rcu_read_lock();
devs = target_rw_devs(c, wp->data_type, target);
@@ -552,95 +944,175 @@ static int open_bucket_add_buckets(struct bch_fs *c,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
+ if (erasure_code && ec_open_bucket(c, ptrs))
+ return 0;
+
+ ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code, flags);
+ if (ret)
+ return ret;
+
+ ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code, reserve, flags);
+ if (ret)
+ return ret;
+
if (erasure_code) {
- if (!ec_open_bucket(c, ptrs)) {
- get_buckets_from_writepoint(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, flags, true);
- if (*nr_effective >= nr_replicas)
- return 0;
+ ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
+ target,
+ nr_replicas, nr_effective,
+ have_cache,
+ reserve, flags, _cl);
+ } else {
+retry_blocking:
+ /*
+ * Try nonblocking first, so that if one device is full we'll try from
+ * other devices:
+ */
+ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
+ nr_replicas, nr_effective, have_cache,
+ flags, wp->data_type, reserve, cl);
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+ !cl && _cl) {
+ cl = _cl;
+ goto retry_blocking;
}
- if (!ec_open_bucket(c, ptrs)) {
- ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs,
- target, erasure_code,
- nr_replicas, nr_effective,
- have_cache, flags, _cl);
- if (ret == -FREELIST_EMPTY ||
- ret == -OPEN_BUCKETS_EMPTY)
- return ret;
- if (*nr_effective >= nr_replicas)
- return 0;
- }
}
- get_buckets_from_writepoint(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, flags, false);
- if (*nr_effective >= nr_replicas)
- return 0;
+ return ret;
+}
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
+static int open_bucket_add_buckets(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ unsigned erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl)
+{
+ int ret;
-retry_blocking:
- /*
- * Try nonblocking first, so that if one device is full we'll try from
- * other devices:
- */
- ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
+ if (erasure_code) {
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, erasure_code,
nr_replicas, nr_effective, have_cache,
reserve, flags, cl);
- if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
- cl = _cl;
- goto retry_blocking;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ return ret;
+ if (*nr_effective >= nr_replicas)
+ return 0;
}
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
-
- return ret;
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, false,
+ nr_replicas, nr_effective, have_cache,
+ reserve, flags, cl);
+ return ret < 0 ? ret : 0;
}
-void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
- struct open_buckets *obs)
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+ struct bch_dev *ca, bool ec)
{
- struct open_buckets ptrs = { .nr = 0 };
- struct open_bucket *ob, *ob2;
- unsigned i, j;
-
- open_bucket_for_each(c, obs, ob, i) {
- bool drop = !ca || ob->dev == ca->dev_idx;
+ if (ec) {
+ return ob->ec != NULL;
+ } else if (ca) {
+ bool drop = ob->dev == ca->dev_idx;
+ struct open_bucket *ob2;
+ unsigned i;
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
- for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
- if (!ob->ec->blocks[j])
+ for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) {
+ if (!ob->ec->blocks[i])
continue;
- ob2 = c->open_buckets + ob->ec->blocks[j];
+ ob2 = c->open_buckets + ob->ec->blocks[i];
drop |= ob2->dev == ca->dev_idx;
}
mutex_unlock(&ob->ec->lock);
}
- if (drop)
- bch2_open_bucket_put(c, ob);
- else
- ob_push(c, &ptrs, ob);
+ return drop;
+ } else {
+ return true;
}
-
- *obs = ptrs;
}
-void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
- struct write_point *wp)
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec, struct write_point *wp)
{
+ struct open_buckets ptrs = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
mutex_lock(&wp->lock);
- bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (should_drop_bucket(ob, c, ca, ec))
+ bch2_open_bucket_put(c, ob);
+ else
+ ob_push(c, &ptrs, ob);
+ wp->ptrs = ptrs;
mutex_unlock(&wp->lock);
}
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec)
+{
+ unsigned i;
+
+	/* Close write points that point to this device... */
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+ bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch2_open_buckets_put(c, &a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ spin_lock(&c->freelist_lock);
+ i = 0;
+ while (i < c->open_buckets_partial_nr) {
+ struct open_bucket *ob =
+ c->open_buckets + c->open_buckets_partial[i];
+
+ if (should_drop_bucket(ob, c, ca, ec)) {
+ --c->open_buckets_partial_nr;
+ swap(c->open_buckets_partial[i],
+ c->open_buckets_partial[c->open_buckets_partial_nr]);
+ ob->on_partial_list = false;
+ spin_unlock(&c->freelist_lock);
+ bch2_open_bucket_put(c, ob);
+ spin_lock(&c->freelist_lock);
+ } else {
+ i++;
+ }
+ }
+ spin_unlock(&c->freelist_lock);
+
+ bch2_ec_stop_dev(c, ca);
+}
+
static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
unsigned long write_point)
{
@@ -686,8 +1158,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
return true;
}
-static bool try_decrease_writepoints(struct bch_fs *c,
- unsigned old_nr)
+static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr)
{
struct write_point *wp;
@@ -708,19 +1179,29 @@ static bool try_decrease_writepoints(struct bch_fs *c,
hlist_del_rcu(&wp->node);
mutex_unlock(&c->write_points_hash_lock);
- bch2_writepoint_stop(c, NULL, wp);
+ bch2_writepoint_stop(c, NULL, false, wp);
return true;
}
-static struct write_point *writepoint_find(struct bch_fs *c,
+static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ if (!mutex_trylock(lock)) {
+ bch2_trans_unlock(trans);
+ mutex_lock(lock);
+ }
+}
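
/*
 * Minimal usage sketch, assuming a caller that already holds a btree_trans
 * and needs a write point's mutex: if the trylock fails, btree node locks
 * are dropped before blocking, avoiding a lock inversion against threads
 * that take btree locks while holding the mutex.
 *
 *	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
 *	... use the write point ...
 *	mutex_unlock(&wp->lock);
 */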
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
unsigned long write_point)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp, *oldest;
struct hlist_head *head;
if (!(write_point & 1UL)) {
wp = (struct write_point *) write_point;
- mutex_lock(&wp->lock);
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
return wp;
}
@@ -729,7 +1210,7 @@ restart_find:
wp = __writepoint_find(head, write_point);
if (wp) {
lock_wp:
- mutex_lock(&wp->lock);
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
if (wp->write_point == write_point)
goto out;
mutex_unlock(&wp->lock);
@@ -742,8 +1223,8 @@ restart_find_oldest:
if (!oldest || time_before64(wp->last_used, oldest->last_used))
oldest = wp;
- mutex_lock(&oldest->lock);
- mutex_lock(&c->write_points_hash_lock);
+ bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
+ bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
if (oldest >= c->write_points + c->write_points_nr ||
try_increase_writepoints(c)) {
mutex_unlock(&c->write_points_hash_lock);
@@ -764,35 +1245,35 @@ restart_find_oldest:
hlist_add_head_rcu(&wp->node, head);
mutex_unlock(&c->write_points_hash_lock);
out:
- wp->last_used = sched_clock();
+ wp->last_used = local_clock();
return wp;
}
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
- unsigned target,
- unsigned erasure_code,
- struct write_point_specifier write_point,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *cl)
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
+ unsigned target,
+ unsigned erasure_code,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl,
+ struct write_point **wp_ret)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp;
struct open_bucket *ob;
struct open_buckets ptrs;
unsigned nr_effective, write_points_nr;
- unsigned ob_flags = 0;
bool have_cache;
int ret;
int i;
- if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
- ob_flags |= BUCKET_ALLOC_USE_DURABILITY;
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
@@ -801,35 +1282,44 @@ retry:
write_points_nr = c->write_points_nr;
have_cache = false;
- wp = writepoint_find(c, write_point.v);
-
- if (wp->data_type == BCH_DATA_user)
- ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
+ *wp_ret = wp = writepoint_find(trans, write_point.v);
/* metadata may not allocate on cache devices: */
if (wp->data_type != BCH_DATA_user)
have_cache = true;
- if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
- ob_flags, cl);
- } else {
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, reserve,
- ob_flags, NULL);
- if (!ret)
+ flags, NULL);
+ if (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto alloc_done;
- ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+ /* Don't retry from all devices if we're out of open buckets: */
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ goto allocate_blocking;
+
+ /*
+ * Only try to allocate cache (durability = 0 devices) from the
+ * specified target:
+ */
+ have_cache = true;
+
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
0, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
- ob_flags, cl);
+ flags, cl);
+ } else {
+allocate_blocking:
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, reserve,
+ flags, cl);
}
alloc_done:
BUG_ON(!ret && nr_effective < nr_replicas);
@@ -837,7 +1327,7 @@ alloc_done:
if (erasure_code && !ec_open_bucket(c, &ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
- if (ret == -INSUFFICIENT_DEVICES &&
+ if (ret == -BCH_ERR_insufficient_devices &&
nr_effective >= nr_replicas_required)
ret = 0;
@@ -857,9 +1347,7 @@ alloc_done:
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
- verify_not_stale(c, &wp->ptrs);
-
- return wp;
+ return 0;
err:
open_bucket_for_each(c, &wp->ptrs, ob, i)
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
@@ -870,19 +1358,17 @@ err:
mutex_unlock(&wp->lock);
- if (ret == -FREELIST_EMPTY &&
+ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
try_decrease_writepoints(c, write_points_nr))
goto retry;
- switch (ret) {
- case -OPEN_BUCKETS_EMPTY:
- case -FREELIST_EMPTY:
- return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
- case -INSUFFICIENT_DEVICES:
- return ERR_PTR(-EROFS);
- default:
- BUG();
- }
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty))
+ return cl
+ ? -BCH_ERR_bucket_alloc_blocked
+ : -BCH_ERR_ENOSPC_bucket_alloc;
+
+ return ret;
}
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
@@ -899,34 +1385,11 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
};
}
-/*
- * Append pointers to the space we just allocated to @k, and mark @sectors space
- * as allocated out of @ob
- */
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
struct bkey_i *k, unsigned sectors,
bool cached)
-
{
- struct open_bucket *ob;
- unsigned i;
-
- BUG_ON(sectors > wp->sectors_free);
- wp->sectors_free -= sectors;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
- struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
-
- ptr.cached = cached ||
- (!ca->mi.durability &&
- wp->data_type == BCH_DATA_user);
-
- bch2_bkey_append_ptr(k, ptr);
-
- BUG_ON(sectors > ob->sectors_free);
- ob->sectors_free -= sectors;
- }
+ bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}
/*
@@ -935,17 +1398,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
*/
void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
- struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
- wp->ptrs = keep;
-
- mutex_unlock(&wp->lock);
-
- bch2_open_buckets_put(c, &ptrs);
+ bch2_alloc_sectors_done_inlined(c, wp);
}
static inline void writepoint_init(struct write_point *wp,
@@ -953,6 +1406,10 @@ static inline void writepoint_init(struct write_point *wp,
{
mutex_init(&wp->lock);
wp->data_type = type;
+
+ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+ INIT_LIST_HEAD(&wp->writes);
+ spin_lock_init(&wp->writes_lock);
}
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
@@ -983,28 +1440,91 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
wp < c->write_points + c->write_points_nr; wp++) {
writepoint_init(wp, BCH_DATA_user);
- wp->last_used = sched_clock();
+ wp->last_used = local_clock();
wp->write_point = (unsigned long) wp;
hlist_add_head_rcu(&wp->node,
writepoint_hash(c, wp->write_point));
}
}
+static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ unsigned data_type = ob->data_type;
+ barrier(); /* READ_ONCE() doesn't work on bitfields */
+
+ prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ ob - c->open_buckets,
+ atomic_read(&ob->pin),
+ data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ ob->dev, ob->bucket, ob->gen,
+ ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
+ if (ob->ec)
+ prt_printf(out, " ec idx %llu", ob->ec->idx);
+ if (ob->on_partial_list)
+ prt_str(out, " partial");
+ prt_newline(out);
+}
+
void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
{
struct open_bucket *ob;
+ out->atomic++;
+
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list) {
- pr_buf(out, "%zu ref %u type %s\n",
- ob - c->open_buckets,
- atomic_read(&ob->pin),
- bch2_data_types[ob->data_type]);
- }
+ if (ob->valid && !ob->on_partial_list)
+ bch2_open_bucket_to_text(out, c, ob);
spin_unlock(&ob->lock);
}
+ --out->atomic;
+}
+
+void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned i;
+
+ out->atomic++;
+ spin_lock(&c->freelist_lock);
+
+ for (i = 0; i < c->open_buckets_partial_nr; i++)
+ bch2_open_bucket_to_text(out, c,
+ c->open_buckets + c->open_buckets_partial[i]);
+
+ spin_unlock(&c->freelist_lock);
+ --out->atomic;
+}
+
+static const char * const bch2_write_point_states[] = {
+#define x(n) #n,
+ WRITE_POINT_STATES()
+#undef x
+ NULL
+};
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct write_point *wp;
+ unsigned i;
+
+ for (wp = c->write_points;
+ wp < c->write_points + ARRAY_SIZE(c->write_points);
+ wp++) {
+ prt_printf(out, "%lu: ", wp->write_point);
+ prt_human_readable_u64(out, wp->sectors_allocated);
+
+ prt_printf(out, " last wrote: ");
+ bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+ prt_printf(out, " %s: ", bch2_write_point_states[i]);
+ bch2_pr_time_units(out, wp->time[i]);
+ }
+
+ prt_newline(out);
+ }
}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index d466bda9afc8..8a1cf425091b 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -4,6 +4,8 @@
#include "bcachefs.h"
#include "alloc_types.h"
+#include "extents.h"
+#include "super.h"
#include <linux/hash.h>
@@ -12,6 +14,10 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_List;
+extern const char * const bch2_alloc_reserves[];
+
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@@ -25,8 +31,7 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
long bch2_bucket_alloc_new_fs(struct bch_dev *);
struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
- enum alloc_reserve, bool,
- struct closure *);
+ enum alloc_reserve, struct closure *);
static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
struct open_bucket *ob)
@@ -77,6 +82,21 @@ static inline void bch2_open_buckets_put(struct bch_fs *c,
ptrs->nr = 0;
}
+static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
+{
+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
+ wp->ptrs = keep;
+
+ mutex_unlock(&wp->lock);
+
+ bch2_open_buckets_put(c, &ptrs);
+}
+
static inline void bch2_open_bucket_get(struct bch_fs *c,
struct write_point *wp,
struct open_buckets *ptrs)
@@ -115,30 +135,74 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke
return false;
}
-int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ bool ret;
+
+ if (bch2_bucket_is_open(c, dev, bucket))
+ return true;
+
+ spin_lock(&c->freelist_lock);
+ ret = bch2_bucket_is_open(c, dev, bucket);
+ spin_unlock(&c->freelist_lock);
+
+ return ret;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, enum alloc_reserve,
- unsigned, struct closure *);
-
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
- unsigned, unsigned,
- struct write_point_specifier,
- struct bch_devs_list *,
- unsigned, unsigned,
- enum alloc_reserve,
- unsigned,
- struct closure *);
+ unsigned, unsigned *, bool *, unsigned,
+ enum bch_data_type, enum alloc_reserve,
+ struct closure *);
+
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+ unsigned, unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum alloc_reserve,
+ unsigned,
+ struct closure *,
+ struct write_point **);
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+static inline void
+bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ BUG_ON(sectors > wp->sectors_free);
+ wp->sectors_free -= sectors;
+ wp->sectors_allocated += sectors;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
+
+ ptr.cached = cached ||
+ (!ca->mi.durability &&
+ wp->data_type == BCH_DATA_user);
+
+ bch2_bkey_append_ptr(k, ptr);
+
+ BUG_ON(sectors > ob->sectors_free);
+ ob->sectors_free -= sectors;
+ }
+}
+
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i *, unsigned, bool);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
- struct open_buckets *);
-
-void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
- struct write_point *);
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
@@ -153,5 +217,8 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
void bch2_fs_allocator_foreground_init(struct bch_fs *);
void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 409232e3d998..cd0c50aae416 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -8,30 +8,30 @@
#include "clock_types.h"
#include "fifo.h"
-struct ec_bucket_buf;
+struct bucket_alloc_state {
+ u64 buckets_seen;
+ u64 skipped_open;
+ u64 skipped_need_journal_commit;
+ u64 skipped_nocow;
+ u64 skipped_nouse;
+};
-#define ALLOC_THREAD_STATES() \
- x(stopped) \
- x(running) \
- x(blocked) \
- x(blocked_full)
+struct ec_bucket_buf;
-enum allocator_states {
-#define x(n) ALLOCATOR_##n,
- ALLOC_THREAD_STATES()
-#undef x
-};
+#define BCH_ALLOC_RESERVES() \
+ x(btree_movinggc) \
+ x(btree) \
+ x(movinggc) \
+ x(none) \
+ x(stripe)
enum alloc_reserve {
- RESERVE_BTREE_MOVINGGC = -2,
- RESERVE_BTREE = -1,
- RESERVE_MOVINGGC = 0,
- RESERVE_NONE = 1,
- RESERVE_NR = 2,
+#define x(name) RESERVE_##name,
+ BCH_ALLOC_RESERVES()
+#undef x
+ RESERVE_NR,
};
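
/*
 * Illustrative expansion of the x-macro above: with
 * #define x(name) RESERVE_##name, the enum becomes
 *
 *	RESERVE_btree_movinggc, RESERVE_btree, RESERVE_movinggc,
 *	RESERVE_none, RESERVE_stripe, RESERVE_NR
 *
 * and the same list can generate a matching name table (e.g.
 * bch2_alloc_reserves[]) with #define x(name) #name,
 */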
-typedef FIFO(long) alloc_fifo;
-
#define OPEN_BUCKETS_COUNT 1024
#define WRITE_POINT_HASH_NR 32
@@ -53,14 +53,13 @@ struct open_bucket {
* the block in the stripe this open_bucket corresponds to:
*/
u8 ec_idx;
- enum bch_data_type data_type:3;
+ enum bch_data_type data_type:6;
unsigned valid:1;
unsigned on_partial_list:1;
- int alloc_reserve:3;
- unsigned sectors_free;
u8 dev;
u8 gen;
+ u32 sectors_free;
u64 bucket;
struct ec_stripe_new *ec;
};
@@ -76,30 +75,50 @@ struct dev_stripe_state {
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
+#define WRITE_POINT_STATES() \
+ x(stopped) \
+ x(waiting_io) \
+ x(waiting_work) \
+ x(running)
+
+enum write_point_state {
+#define x(n) WRITE_POINT_##n,
+ WRITE_POINT_STATES()
+#undef x
+ WRITE_POINT_STATE_NR
+};
+
struct write_point {
- struct hlist_node node;
- struct mutex lock;
- u64 last_used;
- unsigned long write_point;
- enum bch_data_type data_type;
+ struct {
+ struct hlist_node node;
+ struct mutex lock;
+ u64 last_used;
+ unsigned long write_point;
+ enum bch_data_type data_type;
+
+ /* calculated based on how many pointers we're actually going to use: */
+ unsigned sectors_free;
+
+ struct open_buckets ptrs;
+ struct dev_stripe_state stripe;
- /* calculated based on how many pointers we're actually going to use: */
- unsigned sectors_free;
+ u64 sectors_allocated;
+ } __attribute__((__aligned__(SMP_CACHE_BYTES)));
- struct open_buckets ptrs;
- struct dev_stripe_state stripe;
+ struct {
+ struct work_struct index_update_work;
+
+ struct list_head writes;
+ spinlock_t writes_lock;
+
+ enum write_point_state state;
+ u64 last_state_change;
+ u64 time[WRITE_POINT_STATE_NR];
+ } __attribute__((__aligned__(SMP_CACHE_BYTES)));
};
struct write_point_specifier {
unsigned long v;
};
-struct alloc_heap_entry {
- size_t bucket;
- size_t nr;
- unsigned long key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
new file mode 100644
index 000000000000..a3a1ed6e5968
--- /dev/null
+++ b/fs/bcachefs/backpointers.c
@@ -0,0 +1,886 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+
+#include <linux/mm.h>
+
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bpos bucket,
+ struct bch_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket2;
+ struct bch_backpointer bp2;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+ &bucket2, &bp2);
+ if (bpos_eq(bucket, bucket2) &&
+ !memcmp(&bp, &bp2, sizeof(bp)))
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
+{
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+
+ if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) {
+ prt_str(err, "incorrect value size");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
+ prt_str(err, "backpointer at wrong pos");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+ bch2_btree_ids[bp->btree_id],
+ bp->level,
+ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp->bucket_len);
+ bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ prt_str(out, "bucket=");
+ bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ prt_str(out, " ");
+
+ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+ bp.v->bucket_offset = swab32(bp.v->bucket_offset);
+ bp.v->bucket_len = swab32(bp.v->bucket_len);
+ bch2_bpos_swab(&bp.v->pos);
+}
+
+static noinline int backpointer_mod_err(struct btree_trans *trans,
+ struct bch_backpointer bp,
+ struct bkey_s_c bp_k,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ if (insert) {
+ prt_printf(&buf, "existing backpointer found when inserting ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "found ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ } else if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ prt_printf(&buf, "backpointer not found when deleting");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "searching for ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "got ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+
+ if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ bch2_inconsistent_error(c);
+ return -EIO;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+ struct bpos bucket,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_backpointer *bp_k;
+ struct btree_iter bp_iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bp_k->k.p,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&bp_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (insert
+ ? k.k->type
+ : (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
+ ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ return ret;
+}
+
+/*
+ * Find the next backpointer >= *bp_pos:
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+ struct bpos bucket, int gen,
+ struct bpos *bp_pos,
+ struct bch_backpointer *bp,
+ unsigned iter_flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+ struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (bpos_ge(*bp_pos, bp_end_pos))
+ goto done;
+
+ if (gen >= 0) {
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED|iter_flags);
+ k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ if (k.k->type != KEY_TYPE_alloc_v4 ||
+ bkey_s_c_to_alloc_v4(k).v->gen != gen)
+ goto done;
+ }
+
+ *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
+
+ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+ *bp_pos, iter_flags, k, ret) {
+ if (bpos_ge(k.k->p, bp_end_pos))
+ break;
+
+ *bp_pos = k.k->p;
+ *bp = *bkey_s_c_to_backpointer(k).v;
+ goto out;
+ }
+done:
+ *bp_pos = SPOS_MAX;
+out:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+ struct bpos bp_pos,
+ struct bch_backpointer bp,
+ struct bkey_s_c k,
+ const char *thing_it_points_to)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+
+ if (likely(!bch2_backpointers_no_use_write_buffer))
+ return;
+
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
+ thing_it_points_to);
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, "\n ");
+
+ prt_printf(&buf, "backpointer pos: ");
+ bch2_bpos_to_text(&buf, bp_pos);
+ prt_printf(&buf, "\n ");
+
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+ bch_err_ratelimited(c, "%s", buf.buf);
+ else
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+ printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bp_pos,
+ struct bch_backpointer bp,
+ unsigned iter_flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ min(bp.level, c->btree_roots[bp.btree_id].level),
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (bp.level == c->btree_roots[bp.btree_id].level + 1)
+ k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key);
+
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+
+ if (unlikely(bch2_backpointers_no_use_write_buffer)) {
+ if (bp.level) {
+ struct btree *b;
+
+ /*
+ * If a backpointer for a btree node wasn't found, it may be
+ * because it was overwritten by a new btree node that hasn't
+ * been written out yet - backpointer_get_node() checks for
+ * this:
+ */
+ b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+ if (!IS_ERR_OR_NULL(b))
+ return bkey_i_to_s_c(&b->key);
+
+ bch2_trans_iter_exit(trans, iter);
+
+ if (IS_ERR(b))
+ return bkey_s_c_err(PTR_ERR(b));
+ return bkey_s_c_null;
+ }
+
+ backpointer_not_found(trans, bp_pos, bp, k, "extent");
+ }
+
+ return bkey_s_c_null;
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bp_pos,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct btree *b;
+
+ BUG_ON(!bp.level);
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ bp.level - 1,
+ 0);
+ b = bch2_btree_iter_peek_node(iter);
+ if (IS_ERR(b))
+ goto err;
+
+ if (b && extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
+ return b;
+
+ if (b && btree_node_will_make_reachable(b)) {
+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ } else {
+ backpointer_not_found(trans, bp_pos, bp,
+ bkey_i_to_s_c(&b->key), "btree node");
+ b = NULL;
+ }
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bch_dev *ca;
+ struct bkey_s_c alloc_k;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+ "backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ bp_pos_to_bucket(c, k.k->p), 0);
+
+ alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto out;
+
+ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_backpointers, POS_MIN, 0, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ bch2_check_btree_backpointer(&trans, &iter, k)));
+}
+
+struct bpos_level {
+ unsigned level;
+ struct bpos pos;
+};
+
+static int check_bp_exists(struct btree_trans *trans,
+ struct bpos bucket,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter bp_iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c bp_k;
+ int ret;
+
+ if (bpos_lt(bucket, bucket_start) ||
+ bpos_gt(bucket, bucket_end))
+ return 0;
+
+ if (!bch2_dev_bucket_exists(c, bucket))
+ goto missing;
+
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, bucket, bp.bucket_offset),
+ 0);
+ bp_k = bch2_btree_iter_peek_slot(&bp_iter);
+ ret = bkey_err(bp_k);
+ if (ret)
+ goto err;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
+ if (last_flushed->level != bp.level ||
+ !bpos_eq(last_flushed->pos, orig_k.k->p)) {
+ last_flushed->level = bp.level;
+ last_flushed->pos = orig_k.k->p;
+
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+ goto missing;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ return ret;
+missing:
+ prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+ bch2_btree_ids[bp.btree_id], bp.level);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_printf(&buf, "\nbp pos ");
+ bch2_bpos_to_text(&buf, bp_iter.pos);
+
+ if (c->sb.version < bcachefs_metadata_version_backpointers ||
+ c->opts.reconstruct_alloc ||
+ fsck_err(c, "%s", buf.buf))
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+
+ goto out;
+}
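
/*
 * Note on the restart path above: a mismatching or absent backpointer may
 * just mean it is still sitting in the btree write buffer, so the check
 * flushes the buffer once per extent (tracked via last_flushed) and returns
 * -BCH_ERR_transaction_restart_write_buffer_flush to retry before deciding
 * the backpointer is really missing.
 */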
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek_all_levels(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k,
+ bucket_start, bucket_end,
+ last_flushed);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ c->btree_roots[btree_id].level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ BUG_ON(b != btree_node_root(c, b));
+
+ k = bkey_i_to_s_c(&b->key);
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k,
+ bucket_start, bucket_end,
+ last_flushed);
+ if (ret)
+ goto err;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+ return (struct bbpos) {
+ .btree = bp.btree_id,
+ .pos = bp.pos,
+ };
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ struct sysinfo i;
+ u64 mem_bytes;
+
+ si_meminfo(&i);
+ mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes >> 1, btree_bytes(c));
+}
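
/*
 * Worked example, assuming 16 GiB of RAM and 256 KiB btree nodes: half of
 * memory is 8 GiB, so div_u64(8 << 30, 256 << 10) allows roughly 32768
 * nodes per pass; smaller machines simply get more fsck passes.
 */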
+
+int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
+ unsigned btree_leaf_mask,
+ unsigned btree_interior_mask,
+ struct bbpos start, struct bbpos *end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ enum btree_id btree;
+ int ret = 0;
+
+ for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+
+ if (!((1U << btree) & btree_leaf_mask) &&
+ !((1U << btree) & btree_interior_mask))
+ continue;
+
+ bch2_trans_node_iter_init(trans, &iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, 0);
+ /*
+ * for_each_btree_key_continue() doesn't check the return value
+ * from bch2_btree_iter_advance(), which is needed when
+ * iterating over interior nodes where we'll see keys at
+ * SPOS_MAX:
+ */
+ do {
+ k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
+ ret = bkey_err(k);
+ if (!k.k || ret)
+ break;
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = BBPOS(btree, k.k->p);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+ }
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ *end = BBPOS_MAX;
+ return ret;
+}
+
+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
+ struct bpos bucket_start,
+ struct bpos bucket_end)
+{
+ struct btree_iter iter;
+ enum btree_id btree_id;
+ struct bpos_level last_flushed = { UINT_MAX };
+ int ret = 0;
+
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ depth,
+ BTREE_ITER_ALL_LEVELS|
+ BTREE_ITER_PREFETCH);
+
+ do {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_extent_to_backpointers(trans, &iter,
+ bucket_start, bucket_end,
+ &last_flushed));
+ if (ret)
+ break;
+ } while (!bch2_btree_iter_advance(&iter));
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ break;
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_btree_root_to_backpointers(trans, btree_id,
+ bucket_start, bucket_end,
+ &last_flushed));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
+ struct bpos bucket)
+{
+ return bch2_dev_exists2(c, bucket.inode)
+ ? bucket_pos_to_bp(c, bucket, 0)
+ : bucket;
+}
+
+int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
+ struct bpos start, struct bpos *end)
+{
+ struct btree_iter alloc_iter;
+ struct btree_iter bp_iter;
+ struct bkey_s_c alloc_k, bp_k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ bool alloc_end = false, bp_end = false;
+ int ret = 0;
+
+ bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ start, 0, 1, 0);
+ bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
+ while (1) {
+ alloc_k = !alloc_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
+ : bkey_s_c_null;
+ bp_k = !bp_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
+ : bkey_s_c_null;
+
+ ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
+ if ((!alloc_k.k && !bp_k.k) || ret) {
+ *end = SPOS_MAX;
+ break;
+ }
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = alloc_k.k->p;
+ break;
+ }
+
+ if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
+ bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
+ if (!bch2_btree_iter_advance(&alloc_iter))
+ alloc_end = true;
+ } else {
+ if (!bch2_btree_iter_advance(&bp_iter))
+ bp_end = true;
+ }
+ }
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct bpos start = POS_MIN, end;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ while (1) {
+ ret = bch2_get_alloc_in_memory_pos(&trans, start, &end);
+ if (ret)
+ break;
+
+ if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_extents_to_backpointers(): ");
+ bch2_bpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_extents_to_backpointers_pass(&trans, start, end);
+ if (ret || bpos_eq(end, SPOS_MAX))
+ break;
+
+ start = bpos_successor(end);
+ }
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end,
+ struct bkey_s_c_backpointer bp,
+ struct bpos *last_flushed_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bbpos pos = bp_to_bbpos(*bp.v);
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (bbpos_cmp(pos, start) < 0 ||
+ bbpos_cmp(pos, end) > 0)
+ return 0;
+
+ k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
+ ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
+ *last_flushed_pos = bp.k->p;
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+
+ if (fsck_err_on(!k.k, c,
+ "backpointer for missing extent\n %s",
+ (bch2_backpointer_k_to_text(&buf, c, bp.s_c), buf.buf)))
+ return bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos last_flushed_pos = SPOS_MAX;
+
+ return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_one_backpointer(trans, start, end,
+ bkey_s_c_to_backpointer(k),
+ &last_flushed_pos));
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ while (1) {
+ ret = bch2_get_btree_in_memory_pos(&trans,
+ (1U << BTREE_ID_extents)|
+ (1U << BTREE_ID_reflink),
+ ~0,
+ start, &end);
+ if (ret)
+ break;
+
+ if (!bbpos_cmp(start, BBPOS_MIN) &&
+ bbpos_cmp(end, BBPOS_MAX))
+ bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (bbpos_cmp(start, BBPOS_MIN) ||
+ bbpos_cmp(end, BBPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_backpointers_to_extents(): ");
+ bch2_bbpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bbpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_backpointers_to_extents_pass(&trans, start, end);
+ if (ret || !bbpos_cmp(end, BBPOS_MAX))
+ break;
+
+ start = bbpos_successor(end);
+ }
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
new file mode 100644
index 000000000000..9c03709ade50
--- /dev/null
+++ b/fs/bcachefs/backpointers.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "super.h"
+
+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+ unsigned, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
+ .key_invalid = bch2_backpointer_invalid, \
+ .val_to_text = bch2_backpointer_k_to_text, \
+ .swab = bch2_backpointer_swab, \
+})
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+ struct bpos bp_pos)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+ struct bpos ret;
+
+ ret = POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+
+ return ret;
+}
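
/*
 * Worked example, assuming 512-sector (256 KiB) buckets and
 * MAX_EXTENT_COMPRESS_RATIO_SHIFT == 10:
 *
 *	bucket_pos_to_bp(c, POS(2, 7), 0) == POS(2, (7 * 512) << 10)
 *	bucket_pos_to_bp(c, POS(2, 8), 0) == POS(2, (8 * 512) << 10)
 *
 * so every backpointer for bucket 7 lands in the half-open range between
 * the two, and bp_pos_to_bucket() recovers POS(2, 7) by shifting the offset
 * back down by 10 and dividing by the 512-sector bucket size.
 */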
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos,
+ struct bch_backpointer, struct bkey_s_c, bool);
+
+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+ struct bpos bucket,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_backpointer *bp_k;
+ int ret;
+
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
+
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+}
+
+static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p)
+{
+ return level ? BCH_DATA_btree :
+ p.has_ec ? BCH_DATA_stripe :
+ BCH_DATA_user;
+}
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ s64 sectors = level ? btree_sectors(c) : k.k->size;
+ u32 bucket_offset;
+
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bp = (struct bch_backpointer) {
+ .btree_id = btree_id,
+ .level = level,
+ .data_type = data_type,
+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+ p.crc.offset,
+ .bucket_len = ptr_disk_sectors(sectors, p),
+ .pos = k.k->p,
+ };
+}
+
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+ struct bpos *, struct bch_backpointer *, unsigned);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_backpointer,
+ unsigned);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
new file mode 100644
index 000000000000..1fbed1f8378d
--- /dev/null
+++ b/fs/bcachefs/bbpos.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_H
+#define _BCACHEFS_BBPOS_H
+
+#include "bkey_methods.h"
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
+{
+ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
+}
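
/*
 * The a ?: b form above is the GNU C conditional-with-omitted-operand: if
 * the btree ids differ, their cmp_int() result is returned directly, and
 * only on a tie does the comparison fall through to bpos_cmp(), giving a
 * lexicographic (btree, pos) ordering.
 */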
+
+static inline struct bbpos bbpos_successor(struct bbpos pos)
+{
+ if (bpos_cmp(pos.pos, SPOS_MAX)) {
+ pos.pos = bpos_successor(pos.pos);
+ return pos;
+ }
+
+ if (pos.btree != BTREE_ID_NR) {
+ pos.btree++;
+ pos.pos = POS_MIN;
+ return pos;
+ }
+
+ BUG();
+}
+
+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
+{
+ prt_str(out, bch2_btree_ids[pos.btree]);
+ prt_char(out, ':');
+ bch2_bpos_to_text(out, pos.pos);
+}
+
+#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f1e4871a74c3..1e7c810d3569 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -107,7 +107,7 @@
*
* BTREE NODES:
*
- * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
* free smaller than a bucket - so, that's how big our btree nodes are.
*
* (If buckets are really big we'll only use part of the bucket for a btree node
@@ -206,11 +206,25 @@
#include "bcachefs_format.h"
#include "errcode.h"
#include "fifo.h"
+#include "nocow_locking_types.h"
#include "opts.h"
#include "util.h"
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_WRITE_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
#define dynamic_fault(...) 0
-#define race_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
+
+#define trace_and_count(_c, _name, ...) \
+do { \
+ this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ trace_##_name(__VA_ARGS__); \
+} while (0)
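
/*
 * Usage sketch with a hypothetical event name: a call such as
 *
 *	trace_and_count(c, bucket_alloc_fail, c, ...);
 *
 * bumps the persistent per-filesystem counter
 * c->counters[BCH_COUNTER_bucket_alloc_fail] and then emits the matching
 * tracepoint trace_bucket_alloc_fail(c, ...), keeping counter and
 * tracepoint in sync from a single call site.
 */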
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
@@ -220,13 +234,31 @@
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
-#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
-#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
+#define BCACHEFS_LOG_PREFIX
+#endif
+
+#ifdef BCACHEFS_LOG_PREFIX
+
+#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
+#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
+
#else
-#define bch2_fmt(_c, fmt) fmt "\n"
-#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
+
+#define bch2_log_msg(_c, fmt) fmt
+#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
+
#endif
+#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+
#define bch_info(c, fmt, ...) \
printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
@@ -235,13 +267,28 @@
printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn_ratelimited(c, fmt, ...) \
printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+
#define bch_err(c, fmt, ...) \
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev(ca, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset(ca, _offset, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum(c, _inum, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev_ratelimited(ca, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_verbose(c, fmt, ...) \
do { \
@@ -272,18 +319,18 @@ do { \
"done in memory") \
BCH_DEBUG_PARAM(verify_all_btree_replicas, \
"When reading btree nodes, read all replicas and " \
- "compare them")
+ "compare them") \
+ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
+ "Don't use the write buffer for backpointers, enabling "\
+ "extra runtime checks")
-/* Parameters that should only be compiled in in debug mode: */
+/* Parameters that should only be compiled in debug mode: */
#define BCH_DEBUG_PARAMS_DEBUG() \
BCH_DEBUG_PARAM(expensive_debug_checks, \
"Enables various runtime debugging checks that " \
"significantly affect performance") \
BCH_DEBUG_PARAM(debug_check_iterators, \
"Enables extra verification for btree iterators") \
- BCH_DEBUG_PARAM(debug_check_bkeys, \
- "Run bkey_debugcheck (primarily checking GC/allocation "\
- "information) when iterating over keys") \
BCH_DEBUG_PARAM(debug_check_btree_accounting, \
"Verify btree accounting for keys within a node") \
BCH_DEBUG_PARAM(journal_seq_verify, \
@@ -332,9 +379,6 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
- x(btree_lock_contended_read) \
- x(btree_lock_contended_intent) \
- x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
@@ -343,7 +387,8 @@ BCH_DEBUG_PARAMS_DEBUG()
x(journal_flush_seq) \
x(blocked_journal) \
x(blocked_allocate) \
- x(blocked_allocate_open_bucket)
+ x(blocked_allocate_open_bucket) \
+ x(nocow_lock_contended)
enum bch_time_stats {
#define x(name) BCH_TIME_##name,
@@ -354,6 +399,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
+#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
@@ -394,6 +440,11 @@ enum gc_phase {
GC_PHASE_BTREE_reflink,
GC_PHASE_BTREE_subvolumes,
GC_PHASE_BTREE_snapshots,
+ GC_PHASE_BTREE_lru,
+ GC_PHASE_BTREE_freespace,
+ GC_PHASE_BTREE_need_discard,
+ GC_PHASE_BTREE_backpointers,
+ GC_PHASE_BTREE_bucket_gens,
GC_PHASE_PENDING_DELETE,
};
@@ -438,6 +489,7 @@ struct bch_dev {
struct bch_sb *sb_read_scratch;
int sb_write_error;
dev_t dev;
+ atomic_t flush_seq;
struct bch_devs_mask self;
@@ -450,8 +502,9 @@ struct bch_dev {
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
- struct bucket_array __rcu *buckets[2];
- struct bucket_gens *bucket_gens;
+ struct bucket_array __rcu *buckets_gc;
+ struct bucket_gens __rcu *bucket_gens;
+ u8 *oldest_gen;
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
@@ -461,34 +514,15 @@ struct bch_dev {
/* Allocator: */
u64 new_fs_bucket_idx;
- struct task_struct __rcu *alloc_thread;
+ u64 alloc_cursor;
- /*
- * free: Buckets that are ready to be used
- *
- * free_inc: Incoming buckets - these are buckets that currently have
- * cached data in them, and we can't reuse them until after we write
- * their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
- */
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
unsigned nr_open_buckets;
-
- open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
- open_bucket_idx_t open_buckets_partial_nr;
-
- size_t fifo_last_bucket;
+ unsigned nr_btree_reserve;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
- enum allocator_states allocator_state;
-
- alloc_heap alloc_heap;
-
atomic64_t rebalance_work;
struct journal_device journal;
@@ -498,7 +532,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
- struct time_stats io_latency[2];
+ struct bch2_time_stats io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@@ -509,43 +543,51 @@ struct bch_dev {
enum {
/* startup: */
- BCH_FS_INITIALIZED,
- BCH_FS_ALLOC_READ_DONE,
- BCH_FS_ALLOC_CLEAN,
- BCH_FS_ALLOCATOR_RUNNING,
- BCH_FS_ALLOCATOR_STOPPING,
- BCH_FS_INITIAL_GC_DONE,
- BCH_FS_INITIAL_GC_UNFIXED,
- BCH_FS_TOPOLOGY_REPAIR_DONE,
- BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
+ BCH_FS_MAY_GO_RW,
BCH_FS_RW,
BCH_FS_WAS_RW,
/* shutdown: */
BCH_FS_STOPPING,
BCH_FS_EMERGENCY_RO,
+ BCH_FS_GOING_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
+ BCH_FS_CLEAN_SHUTDOWN,
+
+ /* fsck passes: */
+ BCH_FS_TOPOLOGY_REPAIR_DONE,
+ BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */
+ BCH_FS_CHECK_ALLOC_DONE,
+ BCH_FS_CHECK_LRUS_DONE,
+ BCH_FS_CHECK_BACKPOINTERS_DONE,
+ BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE,
+ BCH_FS_FSCK_DONE,
+ BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
+ BCH_FS_NEED_ANOTHER_GC,
+
+ BCH_FS_HAVE_DELETED_SNAPSHOTS,
/* errors: */
BCH_FS_ERROR,
BCH_FS_TOPOLOGY_ERROR,
BCH_FS_ERRORS_FIXED,
BCH_FS_ERRORS_NOT_FIXED,
-
- /* misc: */
- BCH_FS_NEED_ANOTHER_GC,
- BCH_FS_DELETED_NODES,
- BCH_FS_NEED_ALLOC_WRITE,
- BCH_FS_REBUILD_REPLICAS,
- BCH_FS_HOLD_BTREE_WRITES,
};
struct btree_debug {
unsigned id;
- struct dentry *btree;
- struct dentry *btree_format;
- struct dentry *failed;
+};
+
+#define BCH_TRANSACTIONS_NR 128
+
+struct btree_transaction_stats {
+ struct bch2_time_stats lock_hold_times;
+ struct mutex lock;
+ unsigned nr_max_paths;
+ unsigned wb_updates_size;
+ unsigned max_mem;
+ char *max_paths_text;
};
struct bch_fs_pcpu {
@@ -563,17 +605,22 @@ struct journal_seq_blacklist_table {
struct journal_keys {
struct journal_key {
+ u64 journal_seq;
+ u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
bool allocated;
bool overwritten;
struct bkey_i *k;
- u32 journal_seq;
- u32 journal_offset;
} *d;
+ /*
+ * Gap buffer: instead of all the empty space in the array being at the
+ * end of the buffer - from @nr to @size - the empty space is at @gap.
+ * This means that sequential insertions are O(n) instead of O(n^2).
+ */
+ size_t gap;
size_t nr;
size_t size;
- u64 journal_seq_base;
};
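The gap-buffer comment above deserves a concrete illustration. The following is a minimal, self-contained sketch of the same idea, not the bcachefs implementation: live elements occupy d[0..gap) and d[gap + free..size), so an insertion only pays a memmove proportional to how far the gap has to travel, and a run of sequential insertions keeps the gap in place, costing O(1) each and O(n) total.

struct gap_buf {
	u64	*d;
	size_t	gap;	/* index of the first free slot */
	size_t	nr;	/* live elements */
	size_t	size;	/* capacity */
};

/* Move the gap so it starts at @pos; cost is proportional to the distance moved: */
static void gap_buf_move_gap(struct gap_buf *b, size_t pos)
{
	size_t free = b->size - b->nr;

	if (pos < b->gap)
		memmove(b->d + pos + free, b->d + pos,
			(b->gap - pos) * sizeof(b->d[0]));
	else if (pos > b->gap)
		memmove(b->d + b->gap, b->d + b->gap + free,
			(pos - b->gap) * sizeof(b->d[0]));
	b->gap = pos;
}

/* Insert @v at logical position @pos: */
static int gap_buf_insert(struct gap_buf *b, size_t pos, u64 v)
{
	if (b->nr == b->size)
		return -ENOMEM;		/* a real implementation would grow the buffer here */

	gap_buf_move_gap(b, pos);
	b->d[b->gap++] = v;
	b->nr++;
	return 0;
}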
struct btree_path_buf {
@@ -582,26 +629,37 @@ struct btree_path_buf {
#define REPLICAS_DELTA_LIST_MAX (1U << 16)
-struct snapshot_t {
- u32 parent;
- u32 children[2];
- u32 subvol; /* Nonzero only if a subvolume points to this node: */
- u32 equiv;
-};
-
-typedef struct {
- u32 subvol;
- u64 inum;
-} subvol_inum;
-
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+#define BCH_WRITE_REFS() \
+ x(trans) \
+ x(write) \
+ x(promote) \
+ x(node_rewrite) \
+ x(stripe_create) \
+ x(stripe_delete) \
+ x(reflink) \
+ x(fallocate) \
+ x(discard) \
+ x(invalidate) \
+ x(delete_dead_snapshots) \
+ x(snapshot_delete_pagecache) \
+ x(sysfs)
+
+enum bch_write_ref {
+#define x(n) BCH_WRITE_REF_##n,
+ BCH_WRITE_REFS()
+#undef x
+ BCH_WRITE_REF_NR,
+};
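For readers unfamiliar with the x-macro idiom used throughout this file: BCH_WRITE_REFS() is just a list of x(...) invocations, so each site that needs the list redefines x() before expanding it. After preprocessing, the enum above is equivalent to:

enum bch_write_ref {
	BCH_WRITE_REF_trans,			/* = 0 */
	BCH_WRITE_REF_write,
	BCH_WRITE_REF_promote,
	BCH_WRITE_REF_node_rewrite,
	BCH_WRITE_REF_stripe_create,
	BCH_WRITE_REF_stripe_delete,
	BCH_WRITE_REF_reflink,
	BCH_WRITE_REF_fallocate,
	BCH_WRITE_REF_discard,
	BCH_WRITE_REF_invalidate,
	BCH_WRITE_REF_delete_dead_snapshots,
	BCH_WRITE_REF_snapshot_delete_pagecache,
	BCH_WRITE_REF_sysfs,
	BCH_WRITE_REF_NR,			/* = 13 */
};

The same list can be expanded again elsewhere (e.g. with x() producing string literals) wherever a matching table of names is needed.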
+
struct bch_fs {
struct closure cl;
struct list_head list;
struct kobject kobj;
+ struct kobject counters_kobj;
struct kobject internal;
struct kobject opts_dir;
struct kobject time_stats;
@@ -617,7 +675,11 @@ struct bch_fs {
struct rw_semaphore state_lock;
/* Counts outstanding writes, for clean transition to read-only */
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_t writes[BCH_WRITE_REF_NR];
+#else
struct percpu_ref writes;
+#endif
struct work_struct read_only_work;
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
@@ -673,7 +735,7 @@ struct bch_fs {
struct mutex snapshot_table_lock;
struct work_struct snapshot_delete_work;
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
- struct snapshot_id_list snapshots_unlinked;
+ snapshot_id_list snapshots_unlinked;
struct mutex snapshots_unlinked_lock;
/* BTREE CACHE */
@@ -705,6 +767,16 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
+ struct list_head pending_node_rewrites;
+ struct mutex pending_node_rewrites_lock;
+
+ /* btree_io.c: */
+ spinlock_t btree_write_error_lock;
+ struct btree_write_stats {
+ atomic64_t nr;
+ atomic64_t bytes;
+ } btree_write_stats[BTREE_WRITE_TYPE_NR];
+
/* btree_iter.c: */
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
@@ -716,11 +788,20 @@ struct bch_fs {
bool btree_trans_barrier_initialized;
struct btree_key_cache btree_key_cache;
+ unsigned btree_key_cache_btrees;
+
+ struct btree_write_buffer btree_write_buffer;
struct workqueue_struct *btree_update_wq;
struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
+ /*
+ * Use a dedicated wq for write ref holder tasks. Required to avoid
+ * dependency problems with other wq tasks that can block on ref
+ * draining, such as read-only transition.
+ */
+ struct workqueue_struct *write_ref_wq;
/* ALLOCATION */
struct bch_devs_mask rw_devs[BCH_DATA_NR];
@@ -771,6 +852,9 @@ struct bch_fs {
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_partial_nr;
+
struct write_point btree_write_point;
struct write_point rebalance_write_point;
@@ -780,6 +864,8 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
+ struct work_struct discard_work;
+ struct work_struct invalidate_work;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
@@ -806,6 +892,7 @@ struct bch_fs {
* it's not while a gc is in progress.
*/
struct rw_semaphore gc_lock;
+ struct mutex gc_gens_lock;
/* IO PATH */
struct semaphore io_in_flight;
@@ -814,6 +901,8 @@ struct bch_fs {
struct bio_set bio_write;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
+ struct bucket_nocow_lock_table
+ nocow_locks;
struct rhashtable promote_table;
mempool_t compression_bounce[2];
@@ -822,32 +911,40 @@ struct bch_fs {
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
- struct crypto_skcipher *chacha20;
+ struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
mempool_t large_bkey_pool;
+ /* MOVE.C */
+ struct list_head moving_context_list;
+ struct mutex moving_context_lock;
+
+ struct list_head data_progress_list;
+ struct mutex data_progress_lock;
+
/* REBALANCE */
struct bch_fs_rebalance rebalance;
/* COPYGC */
struct task_struct *copygc_thread;
- copygc_heap copygc_heap;
struct write_point copygc_write_point;
+ s64 copygc_wait_at;
s64 copygc_wait;
-
- /* DATA PROGRESS STATS */
- struct list_head data_progress_list;
- struct mutex data_progress_lock;
+ bool copygc_running;
+ wait_queue_head_t copygc_running_wq;
/* STRIPES: */
GENRADIX(struct stripe) stripes;
GENRADIX(struct gc_stripe) gc_stripes;
+ struct hlist_head ec_stripes_new[32];
+ spinlock_t ec_stripes_new_lock;
+
ec_stripes_heap ec_stripes_heap;
- spinlock_t ec_stripes_heap_lock;
+ struct mutex ec_stripes_heap_lock;
/* ERASURE CODING */
struct list_head ec_stripe_head_list;
@@ -855,29 +952,29 @@ struct bch_fs {
struct list_head ec_stripe_new_list;
struct mutex ec_stripe_new_lock;
+ wait_queue_head_t ec_stripe_new_wait;
struct work_struct ec_stripe_create_work;
u64 ec_stripe_hint;
- struct bio_set ec_bioset;
-
struct work_struct ec_stripe_delete_work;
- struct llist_head ec_stripe_delete_list;
+
+ struct bio_set ec_bioset;
/* REFLINK */
u64 reflink_hint;
reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr;
+ /* fs.c */
+ struct list_head vfs_inodes_list;
+ struct mutex vfs_inodes_lock;
+
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
-
-
- atomic64_t btree_writes_nr;
- atomic64_t btree_writes_sectors;
- spinlock_t btree_write_error_lock;
+ struct bio_set nocow_flush_bioset;
/* ERRORS */
struct list_head fsck_errors;
@@ -888,7 +985,8 @@ struct bch_fs {
struct bch_memquota_type quotas[QTYP_NR];
/* DEBUG JUNK */
- struct dentry *debug;
+ struct dentry *fs_debug_dir;
+ struct dentry *btree_debug_dir;
struct btree_debug btree_debug[BTREE_ID_NR];
struct btree *verify_data;
struct btree_node *verify_ondisk;
@@ -906,24 +1004,65 @@ struct bch_fs {
mempool_t btree_bounce_pool;
struct journal journal;
- struct list_head journal_entries;
+ GENRADIX(struct journal_replay *) journal_entries;
+ u64 journal_entries_base_seq;
struct journal_keys journal_keys;
struct list_head journal_iters;
u64 last_bucket_seq_cleanup;
- /* The rest of this all shows up in sysfs */
- atomic_long_t read_realloc_races;
- atomic_long_t extent_migrate_done;
- atomic_long_t extent_migrate_raced;
+ u64 counters_on_mount[BCH_COUNTER_NR];
+ u64 __percpu *counters;
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
- struct time_stats times[BCH_TIME_STAT_NR];
+ struct bch2_time_stats times[BCH_TIME_STAT_NR];
+
+ struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
};
+extern struct wait_queue_head bch2_read_only_wait;
+
+static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_inc(&c->writes[ref]);
+#else
+ percpu_ref_get(&c->writes);
+#endif
+}
+
+static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget_live(&c->writes);
+#endif
+}
+
+static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ long v = atomic_long_dec_return(&c->writes[ref]);
+
+ BUG_ON(v < 0);
+ if (v)
+ return;
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ if (atomic_long_read(&c->writes[i]))
+ return;
+
+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch2_read_only_wait);
+#else
+ percpu_ref_put(&c->writes);
+#endif
+}
+
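A usage sketch for the helpers above (the function name and error code are illustrative, not taken from this patch): a write path takes a named reference before touching on-disk state and drops it when done. In a BCH_WRITE_REF_DEBUG build each outstanding reference is attributable to a specific enum bch_write_ref, while the normal build keeps the cheap percpu_ref.

static int example_buffered_write(struct bch_fs *c)
{
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_write))
		return -EROFS;	/* filesystem is (going) read-only; errno chosen for illustration */

	/* ... perform the write ... */

	bch2_write_ref_put(c, BCH_WRITE_REF_write);
	return 0;
}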
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
@@ -952,6 +1091,11 @@ static inline size_t btree_sectors(const struct bch_fs *c)
return c->opts.btree_node_size >> 9;
}
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+ return c->btree_key_cache_btrees & (1U << btree);
+}
+
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
@@ -983,4 +1127,7 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
return dev < c->sb.nr_devices && c->devs[dev];
}
+#define BKEY_PADDED_ONSTACK(key, pad) \
+ struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+
#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 5153f0e42054..7d1c0b1e3c54 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -78,6 +78,21 @@
#include <linux/uuid.h>
#include "vstructs.h"
+#define BITMASK(name, type, field, offset, end) \
+static const unsigned name##_OFFSET = offset; \
+static const unsigned name##_BITS = (end - offset); \
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (k->field >> offset) & ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ k->field &= ~(~(~0ULL << (end - offset)) << offset); \
+ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
+}
+
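To make the new BITMASK() helper concrete, take an invocation that appears later in this header, BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1). Simplifying the constant arithmetic, it expands to:

static const unsigned BCH_ALLOC_V4_NEED_DISCARD_OFFSET = 0;
static const unsigned BCH_ALLOC_V4_NEED_DISCARD_BITS = 1;

static inline __u64 BCH_ALLOC_V4_NEED_DISCARD(const struct bch_alloc_v4 *k)
{
	return (k->flags >> 0) & ~(~0ULL << 1);		/* i.e. bit 0 of flags */
}

static inline void SET_BCH_ALLOC_V4_NEED_DISCARD(struct bch_alloc_v4 *k, __u64 v)
{
	k->flags &= ~(~(~0ULL << 1) << 0);
	k->flags |= (v & ~(~0ULL << 1)) << 0;
}

The generated accessors read and write the field directly with no endianness conversion; note that bch_alloc_v4 accordingly declares flags as a plain __u32 rather than a __le32.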
#define LE_BITMASK(_bits, name, type, field, offset, end) \
static const unsigned name##_OFFSET = offset; \
static const unsigned name##_BITS = (end - offset); \
@@ -132,7 +147,7 @@ struct bpos {
#else
#error edit for your odd byteorder.
#endif
-} __attribute__((packed, aligned(4)));
+} __packed __aligned(4);
#define KEY_INODE_MAX ((__u64)~0ULL)
#define KEY_OFFSET_MAX ((__u64)~0ULL)
@@ -166,7 +181,7 @@ struct bversion {
__u32 hi;
__u64 lo;
#endif
-} __attribute__((packed, aligned(4)));
+} __packed __aligned(4);
struct bkey {
/* Size of combined key and value, in u64s */
@@ -199,7 +214,7 @@ struct bkey {
__u8 pad[1];
#endif
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bkey_packed {
__u64 _data[0];
@@ -233,7 +248,7 @@ struct bkey_packed {
* to the same size as struct bkey should hopefully be safest.
*/
__u8 pad[sizeof(struct bkey) - 3];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
#define BKEY_U64s_MAX U8_MAX
@@ -275,16 +290,8 @@ enum bch_bkey_fields {
struct bkey_i {
__u64 _data[0];
- union {
- struct {
- /* Size of combined key and value, in u64s */
- __u8 u64s;
- };
- struct {
- struct bkey k;
- struct bch_val v;
- };
- };
+ struct bkey k;
+ struct bch_val v;
};
#define KEY(_inode, _offset, _size) \
@@ -303,7 +310,7 @@ static inline void bkey_init(struct bkey *k)
#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
#define __BKEY_PADDED(key, pad) \
- struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+ struct bkey_i key; __u64 key ## _pad[pad]
/*
* - DELETED keys are used internally to mark keys that should be ignored but
@@ -321,7 +328,7 @@ static inline void bkey_init(struct bkey *k)
* number.
*
* - WHITEOUT: for hash table btrees
-*/
+ */
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
x(whiteout, 1) \
@@ -347,7 +354,13 @@ static inline void bkey_init(struct bkey *k)
x(subvolume, 21) \
x(snapshot, 22) \
x(inode_v2, 23) \
- x(alloc_v3, 24)
+ x(alloc_v3, 24) \
+ x(set, 25) \
+ x(lru, 26) \
+ x(alloc_v4, 27) \
+ x(backpointer, 28) \
+ x(inode_v3, 29) \
+ x(bucket_gens, 30)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -377,6 +390,10 @@ struct bch_hash_whiteout {
struct bch_val v;
};
+struct bch_set {
+ struct bch_val v;
+};
+
/* Extents */
/*
@@ -454,7 +471,7 @@ struct bch_hash_whiteout {
struct bch_csum {
__le64 lo;
__le64 hi;
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define BCH_EXTENT_ENTRY_TYPES() \
x(ptr, 0) \
@@ -491,7 +508,7 @@ struct bch_extent_crc32 {
_compressed_size:7,
type:2;
#endif
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define CRC32_SIZE_MAX (1U << 7)
#define CRC32_NONCE_MAX 0
@@ -517,7 +534,7 @@ struct bch_extent_crc64 {
type:3;
#endif
__u64 csum_lo;
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define CRC64_SIZE_MAX (1U << 9)
#define CRC64_NONCE_MAX ((1U << 10) - 1)
@@ -541,7 +558,7 @@ struct bch_extent_crc128 {
type:4;
#endif
struct bch_csum csum;
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define CRC128_SIZE_MAX (1U << 13)
#define CRC128_NONCE_MAX ((1U << 13) - 1)
@@ -554,7 +571,7 @@ struct bch_extent_ptr {
__u64 type:1,
cached:1,
unused:1,
- reservation:1,
+ unwritten:1,
offset:44, /* 8 petabytes */
dev:8,
gen:8;
@@ -562,12 +579,12 @@ struct bch_extent_ptr {
__u64 gen:8,
dev:8,
offset:44,
- reservation:1,
+ unwritten:1,
unused:1,
cached:1,
type:1;
#endif
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
@@ -617,9 +634,9 @@ union bch_extent_entry {
struct bch_btree_ptr {
struct bch_val v;
- struct bch_extent_ptr start[0];
__u64 _data[0];
-} __attribute__((packed, aligned(8)));
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
struct bch_btree_ptr_v2 {
struct bch_val v;
@@ -629,18 +646,18 @@ struct bch_btree_ptr_v2 {
__le16 sectors_written;
__le16 flags;
struct bpos min_key;
- struct bch_extent_ptr start[0];
__u64 _data[0];
-} __attribute__((packed, aligned(8)));
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
struct bch_extent {
struct bch_val v;
- union bch_extent_entry start[0];
__u64 _data[0];
-} __attribute__((packed, aligned(8)));
+ union bch_extent_entry start[];
+} __packed __aligned(8);
struct bch_reservation {
struct bch_val v;
@@ -648,7 +665,7 @@ struct bch_reservation {
__le32 generation;
__u8 nr_replicas;
__u8 pad[3];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
/* Maximum size (in u64s) a single pointer could be: */
#define BKEY_EXTENT_PTR_U64s_MAX\
@@ -682,7 +699,7 @@ struct bch_inode {
__le32 bi_flags;
__le16 bi_mode;
__u8 fields[0];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bch_inode_v2 {
struct bch_val v;
@@ -692,20 +709,35 @@ struct bch_inode_v2 {
__le64 bi_flags;
__le16 bi_mode;
__u8 fields[0];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[0];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64))
struct bch_inode_generation {
struct bch_val v;
__le32 bi_generation;
__le32 pad;
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
/*
* bi_subvol and bi_parent_subvol are only set for subvolume roots:
*/
-#define BCH_INODE_FIELDS() \
+#define BCH_INODE_FIELDS_v2() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
x(bi_mtime, 96) \
@@ -732,6 +764,32 @@ struct bch_inode_generation {
x(bi_subvol, 32) \
x(bi_parent_subvol, 32)
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
x(data_checksum, 8) \
@@ -742,7 +800,8 @@ struct bch_inode_generation {
x(promote_target, 16) \
x(foreground_target, 16) \
x(background_target, 16) \
- x(erasure_code, 16)
+ x(erasure_code, 16) \
+ x(nocow, 8)
enum inode_opt_id {
#define x(name, ...) \
@@ -757,16 +816,16 @@ enum {
* User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
* flags)
*/
- __BCH_INODE_SYNC = 0,
- __BCH_INODE_IMMUTABLE = 1,
- __BCH_INODE_APPEND = 2,
- __BCH_INODE_NODUMP = 3,
- __BCH_INODE_NOATIME = 4,
+ __BCH_INODE_SYNC = 0,
+ __BCH_INODE_IMMUTABLE = 1,
+ __BCH_INODE_APPEND = 2,
+ __BCH_INODE_NODUMP = 3,
+ __BCH_INODE_NOATIME = 4,
- __BCH_INODE_I_SIZE_DIRTY= 5,
- __BCH_INODE_I_SECTORS_DIRTY= 6,
- __BCH_INODE_UNLINKED = 7,
- __BCH_INODE_BACKPTR_UNTRUSTED = 8,
+ __BCH_INODE_I_SIZE_DIRTY = 5,
+ __BCH_INODE_I_SECTORS_DIRTY = 6,
+ __BCH_INODE_UNLINKED = 7,
+ __BCH_INODE_BACKPTR_UNTRUSTED = 8,
/* bits 20+ reserved for packed fields below: */
};
@@ -788,6 +847,13 @@ LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
/* Dirents */
/*
@@ -820,15 +886,14 @@ struct bch_dirent {
__u8 d_type;
__u8 d_name[];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define DT_SUBVOL 16
#define BCH_DT_MAX 17
-#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
+#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \
sizeof(struct bkey) - \
- offsetof(struct bch_dirent, d_name))
-
+ offsetof(struct bch_dirent, d_name)))
/* Xattrs */
@@ -844,7 +909,7 @@ struct bch_xattr {
__u8 x_name_len;
__le16 x_val_len;
__u8 x_name[];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
/* Bucket/allocation information: */
@@ -853,7 +918,7 @@ struct bch_alloc {
__u8 fields;
__u8 gen;
__u8 data[];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
@@ -865,6 +930,12 @@ struct bch_alloc {
x(stripe, 32) \
x(stripe_redundancy, 8)
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
struct bch_alloc_v2 {
struct bch_val v;
__u8 nr_fields;
@@ -872,13 +943,13 @@ struct bch_alloc_v2 {
__u8 oldest_gen;
__u8 data_type;
__u8 data[];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
x(stripe, 32) \
x(stripe_redundancy, 8)
@@ -891,14 +962,55 @@ struct bch_alloc_v3 {
__u8 oldest_gen;
__u8 data_type;
__u8 data[];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
- BCH_ALLOC_FIELD_NR
-};
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
+
+struct bch_backpointer {
+ struct bch_val v;
+ __u8 btree_id;
+ __u8 level;
+ __u8 data_type;
+ __u64 bucket_offset:40;
+ __u32 bucket_len;
+ struct bpos pos;
+} __packed __aligned(8);
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
/* Quotas: */
@@ -923,7 +1035,7 @@ struct bch_quota_counter {
struct bch_quota {
struct bch_val v;
struct bch_quota_counter c[Q_COUNTERS];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
/* Erasure coding */
@@ -938,8 +1050,8 @@ struct bch_stripe {
__u8 csum_type;
__u8 pad;
- struct bch_extent_ptr ptrs[0];
-} __attribute__((packed, aligned(8)));
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
/* Reflink: */
@@ -956,14 +1068,14 @@ struct bch_reflink_p {
*/
__le32 front_pad;
__le32 back_pad;
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bch_reflink_v {
struct bch_val v;
__le64 refcount;
union bch_extent_entry start[0];
__u64 _data[0];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bch_indirect_inline_data {
struct bch_val v;
@@ -1015,6 +1127,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+/* LRU btree: */
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __packed __aligned(8);
+
+#define LRU_ID_STRIPES (1U << 16)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1023,16 +1144,18 @@ struct bch_sb_field {
__le32 type;
};
-#define BCH_SB_FIELDS() \
- x(journal, 0) \
- x(members, 1) \
- x(crypt, 2) \
- x(replicas_v0, 3) \
- x(quota, 4) \
- x(disk_groups, 5) \
- x(clean, 6) \
- x(replicas, 7) \
- x(journal_seq_blacklist, 8)
+#define BCH_SB_FIELDS() \
+ x(journal, 0) \
+ x(members, 1) \
+ x(crypt, 2) \
+ x(replicas_v0, 3) \
+ x(quota, 4) \
+ x(disk_groups, 5) \
+ x(clean, 6) \
+ x(replicas, 7) \
+ x(journal_seq_blacklist, 8) \
+ x(journal_v2, 9) \
+ x(counters, 10)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@@ -1041,6 +1164,14 @@ enum bch_sb_field_type {
BCH_SB_FIELD_NR
};
+/*
+ * Most superblock fields are replicated in all device's superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS \
+ ((1U << BCH_SB_FIELD_journal)| \
+ (1U << BCH_SB_FIELD_journal_v2))
+
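Concretely, with BCH_SB_FIELD_journal == 0 and BCH_SB_FIELD_journal_v2 == 9 from the list above, BCH_SINGLE_DEVICE_SB_FIELDS evaluates to (1U << 0) | (1U << 9) == 0x201; i.e. the two fields describing a device's own journal layout are the ones not replicated to every member device.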
/* BCH_SB_FIELD_journal: */
struct bch_sb_field_journal {
@@ -1048,6 +1179,15 @@ struct bch_sb_field_journal {
__le64 buckets[0];
};
+struct bch_sb_field_journal_v2 {
+ struct bch_sb_field field;
+
+ struct bch_sb_field_journal_v2_entry {
+ __le64 start;
+ __le64 nr;
+ } d[0];
+};
+
/* BCH_SB_FIELD_members: */
#define BCH_MIN_NR_NBUCKETS (1 << 6)
@@ -1069,6 +1209,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags[0], 30, 31)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
@@ -1144,13 +1286,16 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
/* BCH_SB_FIELD_replicas: */
#define BCH_DATA_TYPES() \
- x(none, 0) \
+ x(free, 0) \
x(sb, 1) \
x(journal, 2) \
x(btree, 3) \
x(user, 4) \
x(cached, 5) \
- x(parity, 6)
+ x(parity, 6) \
+ x(stripe, 7) \
+ x(need_gc_gens, 8) \
+ x(need_discard, 9)
enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
@@ -1159,31 +1304,54 @@ enum bch_data_type {
BCH_DATA_NR
};
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ return true;
+ default:
+ return false;
+ }
+}
+
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
- __u8 devs[0];
-} __attribute__((packed));
+ __u8 devs[];
+} __packed;
struct bch_sb_field_replicas_v0 {
struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[0];
-} __attribute__((packed, aligned(8)));
+ struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
struct bch_replicas_entry {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[0];
-} __attribute__((packed));
+ __u8 devs[];
+} __packed;
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
struct bch_sb_field_replicas {
struct bch_sb_field field;
- struct bch_replicas_entry entries[0];
-} __attribute__((packed, aligned(8)));
+ struct bch_replicas_entry entries[];
+} __packed __aligned(8);
/* BCH_SB_FIELD_quota: */
@@ -1200,7 +1368,7 @@ struct bch_sb_quota_type {
struct bch_sb_field_quota {
struct bch_sb_field field;
struct bch_sb_quota_type q[QTYP_NR];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
/* BCH_SB_FIELD_disk_groups: */
@@ -1209,7 +1377,7 @@ struct bch_sb_field_quota {
struct bch_disk_group {
__u8 label[BCH_SB_LABEL_SIZE];
__le64 flags[2];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
@@ -1218,7 +1386,100 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;
struct bch_disk_group entries[0];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_counters */
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_alloc_mem_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[0];
+};
/*
* On clean shutdown, store btree roots and current journal sequence number in
@@ -1275,19 +1536,33 @@ struct bch_sb_field_journal_seq_blacklist {
#define BCH_JSET_VERSION_OLD 2
#define BCH_BSET_VERSION_OLD 3
+#define BCH_METADATA_VERSIONS() \
+ x(bkey_renumber, 10) \
+ x(inode_btree_change, 11) \
+ x(snapshot, 12) \
+ x(inode_backpointers, 13) \
+ x(btree_ptr_sectors_written, 14) \
+ x(snapshot_2, 15) \
+ x(reflink_p_fix, 16) \
+ x(subvol_dirent, 17) \
+ x(inode_v2, 18) \
+ x(freespace, 19) \
+ x(alloc_v4, 20) \
+ x(new_data_types, 21) \
+ x(backpointers, 22) \
+ x(inode_v3, 23) \
+ x(unwritten_extents, 24) \
+ x(bucket_gens, 25) \
+ x(lru_v2, 26) \
+ x(fragmentation_lru, 27) \
+ x(no_bps_in_alloc_keys, 28)
+
enum bcachefs_metadata_version {
- bcachefs_metadata_version_min = 9,
- bcachefs_metadata_version_new_versioning = 10,
- bcachefs_metadata_version_bkey_renumber = 10,
- bcachefs_metadata_version_inode_btree_change = 11,
- bcachefs_metadata_version_snapshot = 12,
- bcachefs_metadata_version_inode_backpointers = 13,
- bcachefs_metadata_version_btree_ptr_sectors_written = 14,
- bcachefs_metadata_version_snapshot_2 = 15,
- bcachefs_metadata_version_reflink_p_fix = 16,
- bcachefs_metadata_version_subvol_dirent = 17,
- bcachefs_metadata_version_inode_v2 = 18,
- bcachefs_metadata_version_max = 19,
+ bcachefs_metadata_version_min = 9,
+#define x(t, n) bcachefs_metadata_version_##t = n,
+ BCH_METADATA_VERSIONS()
+#undef x
+ bcachefs_metadata_version_max
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
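Spelling out the expansion: bcachefs_metadata_version_min stays pinned at 9, the x-macro list assigns 10 through 28 (bkey_renumber through no_bps_in_alloc_keys), and the trailing unassigned enumerator bcachefs_metadata_version_max therefore takes the value 29, so bcachefs_metadata_version_current resolves to 28, the newest entry in the list. An illustrative excerpt of the preprocessed result:

enum bcachefs_metadata_version {
	bcachefs_metadata_version_min			= 9,
	bcachefs_metadata_version_bkey_renumber		= 10,
	/* ... */
	bcachefs_metadata_version_no_bps_in_alloc_keys	= 28,
	bcachefs_metadata_version_max,			/* = 29 */
};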
@@ -1302,7 +1577,7 @@ struct bch_sb_layout {
__u8 nr_superblocks;
__u8 pad[5];
__le64 sb_offset[61];
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#define BCH_SB_LAYOUT_SECTOR 7
@@ -1312,7 +1587,7 @@ struct bch_sb_layout {
* @version_min - Oldest metadata version this filesystem contains; so we can
* safely drop compatibility code and refuse to mount filesystems
* we'd need it for
- * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC)
+ * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
* @seq - incremented each time superblock is written
* @uuid - used for generating various magic numbers and identifying
* member devices, never changes
@@ -1353,7 +1628,7 @@ struct bch_sb {
struct bch_sb_field start[0];
__le64 _data[0];
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
/*
* Flags:
@@ -1428,6 +1703,8 @@ LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
+LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
+LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
/*
* Features:
@@ -1626,6 +1903,9 @@ enum bch_compression_opts {
#define BCACHE_MAGIC \
UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCHFS_MAGIC \
+ UUID_LE(0xf67385c6, 0xce66, 0xa990, \
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
#define BCACHEFS_STATFS_MAGIC 0xca451a4e
@@ -1635,6 +1915,7 @@ enum bch_compression_opts {
static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
{
__le64 ret;
+
memcpy(&ret, &sb->uuid, sizeof(ret));
return ret;
}
@@ -1663,7 +1944,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(data_usage, 6) \
x(clock, 7) \
x(dev_usage, 8) \
- x(log, 9)
+ x(log, 9) \
+ x(overwrite, 10)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1708,26 +1990,26 @@ enum {
struct jset_entry_usage {
struct jset_entry entry;
__le64 v;
-} __attribute__((packed));
+} __packed;
struct jset_entry_data_usage {
struct jset_entry entry;
__le64 v;
struct bch_replicas_entry r;
-} __attribute__((packed));
+} __packed;
struct jset_entry_clock {
struct jset_entry entry;
__u8 rw;
__u8 pad[7];
__le64 time;
-} __attribute__((packed));
+} __packed;
struct jset_entry_dev_usage_type {
__le64 buckets;
__le64 sectors;
__le64 fragmented;
-} __attribute__((packed));
+} __packed;
struct jset_entry_dev_usage {
struct jset_entry entry;
@@ -1735,10 +2017,10 @@ struct jset_entry_dev_usage {
__u32 pad;
__le64 buckets_ec;
- __le64 buckets_unavailable;
+ __le64 _buckets_unavailable; /* No longer used */
struct jset_entry_dev_usage_type d[];
-} __attribute__((packed));
+} __packed;
static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
{
@@ -1749,7 +2031,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage
struct jset_entry_log {
struct jset_entry entry;
u8 d[];
-} __attribute__((packed));
+} __packed;
/*
* On disk format for a journal entry:
@@ -1784,7 +2066,7 @@ struct jset {
struct jset_entry start[0];
__u64 _data[0];
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
@@ -1795,16 +2077,21 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
/* Btree: */
#define BCH_BTREE_IDS() \
- x(extents, 0) \
- x(inodes, 1) \
- x(dirents, 2) \
- x(xattrs, 3) \
- x(alloc, 4) \
- x(quotas, 5) \
- x(stripes, 6) \
- x(reflink, 7) \
- x(subvolumes, 8) \
- x(snapshots, 9)
+ x(extents, 0) \
+ x(inodes, 1) \
+ x(dirents, 2) \
+ x(xattrs, 3) \
+ x(alloc, 4) \
+ x(quotas, 5) \
+ x(stripes, 6) \
+ x(reflink, 7) \
+ x(subvolumes, 8) \
+ x(snapshots, 9) \
+ x(lru, 10) \
+ x(freespace, 11) \
+ x(need_discard, 12) \
+ x(backpointers, 13) \
+ x(bucket_gens, 14)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
@@ -1843,7 +2130,7 @@ struct bset {
struct bkey_packed start[0];
__u64 _data[0];
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
@@ -1876,7 +2163,7 @@ struct btree_node {
};
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
@@ -1897,6 +2184,6 @@ struct btree_node_entry {
};
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 930981ad5535..ad47a506a907 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -208,7 +208,7 @@ struct bch_ioctl_data {
__u64 pad[8];
};
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
enum bch_data_event {
BCH_DATA_EVENT_PROGRESS = 0,
@@ -224,7 +224,7 @@ struct bch_ioctl_data_progress {
__u64 sectors_done;
__u64 sectors_total;
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bch_ioctl_data_event {
__u8 type;
@@ -233,12 +233,12 @@ struct bch_ioctl_data_event {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
};
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
struct bch_replicas_usage {
__u64 sectors;
struct bch_replicas_entry r;
-} __attribute__((packed));
+} __packed;
static inline struct bch_replicas_usage *
replicas_usage_next(struct bch_replicas_usage *u)
@@ -285,13 +285,14 @@ struct bch_ioctl_dev_usage {
__u32 bucket_size;
__u64 nr_buckets;
- __u64 available_buckets;
- __u64 buckets[BCH_DATA_NR];
- __u64 sectors[BCH_DATA_NR];
+ __u64 buckets_ec;
- __u64 ec_buckets;
- __u64 ec_sectors;
+ struct bch_ioctl_dev_usage_type {
+ __u64 buckets;
+ __u64 sectors;
+ __u64 fragmented;
+ } d[BCH_DATA_NR];
};
/*
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 946dd27f09fc..ff5d01e6e674 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey.h"
+#include "bkey_cmp.h"
#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
@@ -16,36 +17,49 @@
const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
- const struct bkey_packed *);
-
-void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits)
+void bch2_bkey_packed_to_binary_text(struct printbuf *out,
+ const struct bkey_format *f,
+ const struct bkey_packed *k)
{
- unsigned bit = high_bit_offset, done = 0;
+ const u64 *p = high_word(f, k);
+ unsigned word_bits = 64 - high_bit_offset;
+ unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
+ u64 v = *p & (~0ULL >> high_bit_offset);
+
+ if (!nr_key_bits) {
+ prt_str(out, "(empty)");
+ return;
+ }
while (1) {
- while (bit < 64) {
- if (done && !(done % 8))
- *out++ = ' ';
- *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
- bit++;
- done++;
- if (done == nr_bits) {
- *out++ = '\0';
- return;
- }
+ unsigned next_key_bits = nr_key_bits;
+
+ if (nr_key_bits < 64) {
+ v >>= 64 - nr_key_bits;
+ next_key_bits = 0;
+ } else {
+ next_key_bits -= 64;
}
+ bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+
+ if (!next_key_bits)
+ break;
+
+ prt_char(out, ' ');
+
p = next_word(p);
- bit = 0;
+ v = *p;
+ word_bits = 64;
+ nr_key_bits = next_key_bits;
}
}
#ifdef CONFIG_BCACHEFS_DEBUG
static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
{
struct bkey tmp;
@@ -57,22 +71,35 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
tmp = __bch2_bkey_unpack_key(format, packed);
if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
- char buf1[160], buf2[160];
- char buf3[160], buf4[160];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_to_text(&PBUF(buf1), unpacked);
- bch2_bkey_to_text(&PBUF(buf2), &tmp);
- bch2_to_binary(buf3, (void *) unpacked, 80);
- bch2_to_binary(buf4, high_word(format, packed), 80);
-
- panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
+ prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
format->key_u64s,
format->bits_per_field[0],
format->bits_per_field[1],
format->bits_per_field[2],
format->bits_per_field[3],
- format->bits_per_field[4],
- buf1, buf2, buf3, buf4);
+ format->bits_per_field[4]);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_to_text(&buf, unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_to_text(&buf, &tmp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) &tmp);
+ prt_newline(&buf);
+
+ panic("%s", buf.buf);
}
}
@@ -201,9 +228,10 @@ static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
{
struct pack_state out_s = pack_state_init(out_f, out);
struct unpack_state in_s = unpack_state_init(in_f, in);
+ u64 *w = out->_data;
unsigned i;
- out->_data[0] = 0;
+ *w = 0;
for (i = 0; i < BKEY_NR_FIELDS; i++)
if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
@@ -234,14 +262,6 @@ bool bch2_bkey_transform(const struct bkey_format *out_f,
return true;
}
-#define bkey_fields() \
- x(BKEY_FIELD_INODE, p.inode) \
- x(BKEY_FIELD_OFFSET, p.offset) \
- x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
- x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, version.hi) \
- x(BKEY_FIELD_VERSION_LO, version.lo)
-
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
const struct bkey_packed *in)
{
@@ -292,25 +312,17 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
const struct bkey_format *format)
{
struct pack_state state = pack_state_init(format, out);
+ u64 *w = out->_data;
EBUG_ON((void *) in == (void *) out);
EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
EBUG_ON(in->format != KEY_FORMAT_CURRENT);
- out->_data[0] = 0;
+ *w = 0;
#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
bkey_fields()
#undef x
-
- /*
- * Extents - we have to guarantee that if an extent is packed, a trimmed
- * version will also pack:
- */
- if (bkey_start_offset(in) <
- le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
- return false;
-
pack_state_finish(&state, out);
out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
out->format = KEY_FORMAT_LOCAL_BTREE;
@@ -439,6 +451,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
{
const struct bkey_format *f = &b->format;
struct pack_state state = pack_state_init(f, out);
+ u64 *w = out->_data;
#ifdef CONFIG_BCACHEFS_DEBUG
struct bpos orig = in;
#endif
@@ -451,7 +464,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
* enough - we need to make sure to zero them out:
*/
for (i = 0; i < f->key_u64s; i++)
- out->_data[i] = 0;
+ w[i] = 0;
if (unlikely(in.snapshot <
le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
@@ -475,18 +488,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
return BKEY_PACK_POS_FAIL;
- if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
in.offset = KEY_OFFSET_MAX;
in.snapshot = KEY_SNAPSHOT_MAX;
exact = false;
}
- if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
in.snapshot = KEY_SNAPSHOT_MAX;
exact = false;
}
- if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
exact = false;
pack_state_finish(&state, out);
@@ -523,24 +536,6 @@ void bch2_bkey_format_init(struct bkey_format_state *s)
s->field_min[BKEY_FIELD_SIZE] = 0;
}
-static void __bkey_format_add(struct bkey_format_state *s,
- unsigned field, u64 v)
-{
- s->field_min[field] = min(s->field_min[field], v);
- s->field_max[field] = max(s->field_max[field], v);
-}
-
-/*
- * Changes @format so that @k can be successfully packed with @format
- */
-void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
-{
-#define x(id, field) __bkey_format_add(s, id, k->field);
- bkey_fields()
-#undef x
- __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
-}
-
void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
{
unsigned field = 0;
@@ -731,50 +726,6 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
#ifdef CONFIG_X86_64
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- long d0, d1, d2, d3;
- int cmp;
-
- /* we shouldn't need asm for this, but gcc is being retarded: */
-
- asm(".intel_syntax noprefix;"
- "xor eax, eax;"
- "xor edx, edx;"
- "1:;"
- "mov r8, [rdi];"
- "mov r9, [rsi];"
- "sub ecx, 64;"
- "jl 2f;"
-
- "cmp r8, r9;"
- "jnz 3f;"
-
- "lea rdi, [rdi - 8];"
- "lea rsi, [rsi - 8];"
- "jmp 1b;"
-
- "2:;"
- "not ecx;"
- "shr r8, 1;"
- "shr r9, 1;"
- "shr r8, cl;"
- "shr r9, cl;"
- "cmp r8, r9;"
-
- "3:\n"
- "seta al;"
- "setb dl;"
- "sub eax, edx;"
- ".att_syntax prefix;"
- : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
- : "0" (l), "1" (r), "3" (nr_key_bits)
- : "r8", "r9", "cc", "memory");
-
- return cmp;
-}
-
#define I(_x) (*(out)++ = (_x))
#define I1(i0) I(i0)
#define I2(i0, i1) (I1(i0), I(i1))
@@ -1005,40 +956,6 @@ int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
}
#else
-static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
- unsigned nr_key_bits)
-{
- u64 l_v, r_v;
-
- if (!nr_key_bits)
- return 0;
-
- /* for big endian, skip past header */
- nr_key_bits += high_bit_offset;
- l_v = *l & (~0ULL >> high_bit_offset);
- r_v = *r & (~0ULL >> high_bit_offset);
-
- while (1) {
- if (nr_key_bits < 64) {
- l_v >>= 64 - nr_key_bits;
- r_v >>= 64 - nr_key_bits;
- nr_key_bits = 0;
- } else {
- nr_key_bits -= 64;
- }
-
- if (!nr_key_bits || l_v != r_v)
- break;
-
- l = next_word(l);
- r = next_word(r);
-
- l_v = *l;
- r_v = *r;
- }
-
- return cmp_int(l_v, r_v);
-}
#endif
__pure
@@ -1046,19 +963,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
const struct bkey_packed *r,
const struct btree *b)
{
- const struct bkey_format *f = &b->format;
- int ret;
-
- EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
- EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
-
- ret = __bkey_cmp_bits(high_word(f, l),
- high_word(f, r),
- b->nr_key_bits);
-
- EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
- bkey_unpack_pos(b, r)));
- return ret;
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
}
__pure __flatten
@@ -1074,20 +979,7 @@ int bch2_bkey_cmp_packed(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r)
{
- struct bkey unpacked;
-
- if (likely(bkey_packed(l) && bkey_packed(r)))
- return __bch2_bkey_cmp_packed_format_checked(l, r, b);
-
- if (bkey_packed(l)) {
- __bkey_unpack_key_format_checked(b, &unpacked, l);
- l = (void*) &unpacked;
- } else if (bkey_packed(r)) {
- __bkey_unpack_key_format_checked(b, &unpacked, r);
- r = (void*) &unpacked;
- }
-
- return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+ return bch2_bkey_cmp_packed_inlined(b, l, r);
}
__pure __flatten
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 7dee3d8e0a3d..2650bd639b55 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -5,6 +5,7 @@
#include <linux/bug.h>
#include "bcachefs_format.h"
+#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
@@ -12,7 +13,9 @@
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
#endif
-void bch2_to_binary(char *, const u64 *, unsigned);
+void bch2_bkey_packed_to_binary_text(struct printbuf *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
/* bkey with split value, const */
struct bkey_s_c {
@@ -31,7 +34,12 @@ struct bkey_s {
};
};
-#define bkey_next(_k) vstruct_next(_k)
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) (k->_data + k->k.u64s);
+}
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
@@ -42,12 +50,15 @@ static inline size_t bkey_val_bytes(const struct bkey *k)
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
{
- k->u64s = BKEY_U64s + val_u64s;
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
}
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
{
- k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
}
#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
@@ -83,17 +94,6 @@ do { \
struct btree;
-struct bkey_format_state {
- u64 field_min[BKEY_NR_FIELDS];
- u64 field_max[BKEY_NR_FIELDS];
-};
-
-void bch2_bkey_format_init(struct bkey_format_state *);
-void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
-void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
-struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-const char *bch2_bkey_format_validate(struct bkey_format *);
-
__pure
unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
const struct bkey_packed *,
@@ -129,8 +129,9 @@ int bkey_cmp_left_packed(const struct btree *b,
}
/*
- * we prefer to pass bpos by ref, but it's often enough terribly convenient to
- * pass it by by val... as much as I hate c++, const ref would be nice here:
+ * The compiler generates better code when we pass bpos by ref, but it's often
+ * enough terribly convenient to pass it by val... as much as I hate c++, const
+ * ref would be nice here:
*/
__pure __flatten
static inline int bkey_cmp_left_packed_byval(const struct btree *b,
@@ -140,6 +141,37 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
return bkey_cmp_left_packed(b, l, &r);
}
+static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
+{
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset) |
+ (l.snapshot ^ r.snapshot));
+}
+
+static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
+}
+
+static __always_inline bool bpos_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
+}
+
+static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
+{
+ return bpos_lt(r, l);
+}
+
+static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
+{
+ return bpos_le(r, l);
+}
+
static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
{
return cmp_int(l.inode, r.inode) ?:
@@ -147,20 +179,60 @@ static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
cmp_int(l.snapshot, r.snapshot);
}
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{
+ return bpos_lt(l, r) ? l : r;
+}
+
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+ return bpos_gt(l, r) ? l : r;
+}
+
+static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
+{
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset));
+}
+
+static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset < r.offset;
+}
+
+static __always_inline bool bkey_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset <= r.offset;
+}
+
+static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
+{
+ return bkey_lt(r, l);
+}
+
+static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
+{
+ return bkey_le(r, l);
+}
+
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
{
return cmp_int(l.inode, r.inode) ?:
cmp_int(l.offset, r.offset);
}
-static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+static inline struct bpos bkey_min(struct bpos l, struct bpos r)
{
- return bpos_cmp(l, r) < 0 ? l : r;
+ return bkey_lt(l, r) ? l : r;
}
-static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+static inline struct bpos bkey_max(struct bpos l, struct bpos r)
{
- return bpos_cmp(l, r) > 0 ? l : r;
+ return bkey_gt(l, r) ? l : r;
}
void bch2_bpos_swab(struct bpos *);
@@ -351,6 +423,99 @@ void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
const struct bkey_format *);
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
+ compiled_unpack_fn unpack_fn = b->aux_data;
+ unpack_fn(dst, src);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ bch2_expensive_debug_checks) {
+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+ }
+ } else {
+ *dst = __bch2_bkey_unpack_key(&b->format, src);
+ }
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+ __bkey_unpack_key_format_checked(b, &dst, src);
+ return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (likely(bkey_packed(src)))
+ __bkey_unpack_key_format_checked(b, dst, src);
+ else
+ *dst = *packed_to_bkey_c(src);
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_key_format_checked(b, src)
+ : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+ return bkey_unpack_key_format_checked(b, src).p;
+#else
+ return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_pos_format_checked(b, src)
+ : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
+ const struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(const struct btree *b,
+ struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
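/*
 * Typical use of bkey_disassemble() above, mirroring the bch2_dump_bset()
 * loop later in this patch (a usage sketch, not part of the patch; b and _k
 * are assumed to be a btree node and a packed key within it):
 */
struct bkey uk;
struct bkey_s_c k = bkey_disassemble(b, _k, &uk);
/* k.k now points at the on-stack uk; k.v at the value stored in the node */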
+
static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
@@ -563,4 +728,39 @@ void bch2_bkey_pack_test(void);
static inline void bch2_bkey_pack_test(void) {}
#endif
+#define bkey_fields() \
+ x(BKEY_FIELD_INODE, p.inode) \
+ x(BKEY_FIELD_OFFSET, p.offset) \
+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
+ x(BKEY_FIELD_SIZE, size) \
+ x(BKEY_FIELD_VERSION_HI, version.hi) \
+ x(BKEY_FIELD_VERSION_LO, version.lo)
+
+struct bkey_format_state {
+ u64 field_min[BKEY_NR_FIELDS];
+ u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+
+static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
+{
+ s->field_min[field] = min(s->field_min[field], v);
+ s->field_max[field] = max(s->field_max[field], v);
+}
+
+/*
+ * Updates the field ranges tracked in @s so that @k can be successfully
+ * packed with the format eventually built from @s
+ */
+static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+#define x(id, field) __bkey_format_add(s, id, k->field);
+ bkey_fields()
+#undef x
+}
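/*
 * Standalone sketch (not from the patch) of the x-macro pattern behind
 * bkey_fields(): a single field list expands into both an enum and a name
 * table, the same trick used for bch2_btree_node_flags[] later in this
 * patch. All names below are illustrative.
 */
#include <stdio.h>

#define FIELDS()	\
	x(INODE)	\
	x(OFFSET)	\
	x(SNAPSHOT)

enum field {
#define x(name) FIELD_##name,
	FIELDS()
#undef x
	NR_FIELDS
};

static const char * const field_names[] = {
#define x(name) #name,
	FIELDS()
#undef x
};

int main(void)
{
	for (int i = 0; i < NR_FIELDS; i++)
		printf("%d %s\n", i, field_names[i]);
	return 0;
}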
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+const char *bch2_bkey_format_validate(struct bkey_format *);
+
#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
index 0d7c67a959af..a30c4ae8eb36 100644
--- a/fs/bcachefs/bkey_buf.h
+++ b/fs/bcachefs/bkey_buf.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_BKEY_BUF_H
#include "bcachefs.h"
+#include "bkey.h"
struct bkey_buf {
struct bkey_i *k;
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
new file mode 100644
index 000000000000..5f42a6e69360
--- /dev/null
+++ b/fs/bcachefs/bkey_cmp.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_CMP_H
+#define _BCACHEFS_BKEY_CMP_H
+
+#include "bkey.h"
+
+#ifdef CONFIG_X86_64
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ long d0, d1, d2, d3;
+ int cmp;
+
+ /* we shouldn't need asm for this, but gcc generates poor code without it: */
+
+ asm(".intel_syntax noprefix;"
+ "xor eax, eax;"
+ "xor edx, edx;"
+ "1:;"
+ "mov r8, [rdi];"
+ "mov r9, [rsi];"
+ "sub ecx, 64;"
+ "jl 2f;"
+
+ "cmp r8, r9;"
+ "jnz 3f;"
+
+ "lea rdi, [rdi - 8];"
+ "lea rsi, [rsi - 8];"
+ "jmp 1b;"
+
+ "2:;"
+ "not ecx;"
+ "shr r8, 1;"
+ "shr r9, 1;"
+ "shr r8, cl;"
+ "shr r9, cl;"
+ "cmp r8, r9;"
+
+ "3:\n"
+ "seta al;"
+ "setb dl;"
+ "sub eax, edx;"
+ ".att_syntax prefix;"
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+ : "0" (l), "1" (r), "3" (nr_key_bits)
+ : "r8", "r9", "cc", "memory");
+
+ return cmp;
+}
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ u64 l_v, r_v;
+
+ if (!nr_key_bits)
+ return 0;
+
+ /* for big endian, skip past header */
+ nr_key_bits += high_bit_offset;
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (1) {
+ if (nr_key_bits < 64) {
+ l_v >>= 64 - nr_key_bits;
+ r_v >>= 64 - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= 64;
+ }
+
+ if (!nr_key_bits || l_v != r_v)
+ break;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ }
+
+ return cmp_int(l_v, r_v);
+}
+#endif
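/*
 * Standalone sketch (not from the patch) of what the generic fallback above
 * does: compare two bit strings stored most-significant-word first, one
 * 64-bit word at a time, returning -1/0/1 like cmp_int(). The real code also
 * handles key lengths that aren't a multiple of 64 bits and the header
 * offset; this sketch does not.
 */
#include <stdint.h>
#include <stdio.h>

static int cmp_u64(uint64_t l, uint64_t r)
{
	return (l > r) - (l < r);
}

static int cmp_words(const uint64_t *l, const uint64_t *r, unsigned nr_words)
{
	for (unsigned i = 0; i < nr_words; i++)
		if (l[i] != r[i])
			return cmp_u64(l[i], r[i]);
	return 0;
}

int main(void)
{
	uint64_t a[2] = { 5, 9 }, b[2] = { 5, 7 };

	printf("%d\n", cmp_words(a, b, 2)); /* 1: first differing word decides */
	return 0;
}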
+
+static inline __pure __flatten
+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ int ret;
+
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ ret = __bkey_cmp_bits(high_word(f, l),
+ high_word(f, r),
+ b->nr_key_bits);
+
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
+ bkey_unpack_pos(b, r)));
+ return ret;
+}
+
+static inline __pure __flatten
+int bch2_bkey_cmp_packed_inlined(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ struct bkey unpacked;
+
+ if (likely(bkey_packed(l) && bkey_packed(r)))
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+
+ if (bkey_packed(l)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
+ l = (void *) &unpacked;
+ } else if (bkey_packed(r)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
+ r = (void *) &unpacked;
+ }
+
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+}
+
+#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5c900cf8a8a2..72d95831d65d 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "alloc_background.h"
@@ -9,6 +10,7 @@
#include "error.h"
#include "extents.h"
#include "inode.h"
+#include "lru.h"
#include "quota.h"
#include "reflink.h"
#include "subvolume.h"
@@ -21,53 +23,60 @@ const char * const bch2_bkey_types[] = {
NULL
};
-static const char *deleted_key_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- return NULL;
+ return 0;
}
-#define bch2_bkey_ops_deleted (struct bkey_ops) { \
+#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
.key_invalid = deleted_key_invalid, \
-}
+})
-#define bch2_bkey_ops_whiteout (struct bkey_ops) { \
+#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
.key_invalid = deleted_key_invalid, \
-}
+})
-static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (bkey_val_bytes(k.k))
- return "value size should be zero";
+ if (bkey_val_bytes(k.k)) {
+ prt_printf(err, "incorrect value size (%zu != 0)",
+ bkey_val_bytes(k.k));
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
-#define bch2_bkey_ops_error (struct bkey_ops) { \
+#define bch2_bkey_ops_error ((struct bkey_ops) { \
.key_invalid = empty_val_key_invalid, \
-}
+})
-static const char *key_type_cookie_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_cookie));
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
-#define bch2_bkey_ops_cookie (struct bkey_ops) { \
+#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
.key_invalid = key_type_cookie_invalid, \
-}
+})
-#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \
+#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
.key_invalid = empty_val_key_invalid, \
-}
+})
-static const char *key_type_inline_data_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- return NULL;
+ return 0;
}
static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
@@ -76,27 +85,53 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
- pr_buf(out, "datalen %u: %*phN",
+ prt_printf(out, "datalen %u: %*phN",
datalen, min(datalen, 32U), d.v->data);
}
-#define bch2_bkey_ops_inline_data (struct bkey_ops) { \
+#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
.key_invalid = key_type_inline_data_invalid, \
.val_to_text = key_type_inline_data_to_text, \
+})
+
+static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
+{
+ if (bkey_val_bytes(k.k)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_cookie));
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
+}
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
}
+#define bch2_bkey_ops_set ((struct bkey_ops) { \
+ .key_invalid = key_type_set_invalid, \
+ .key_merge = key_type_set_merge, \
+})
+
const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
#undef x
};
-const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (k.k->type >= KEY_TYPE_MAX)
- return "invalid type";
+ if (k.k->type >= KEY_TYPE_MAX) {
+ prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX);
+ return -BCH_ERR_invalid_bkey;
+ }
- return bch2_bkey_ops[k.k->type].key_invalid(c, k);
+ return bch2_bkey_ops[k.k->type].key_invalid(c, k, flags, err);
}
static unsigned bch2_key_types_allowed[] = {
@@ -114,6 +149,7 @@ static unsigned bch2_key_types_allowed[] = {
(1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_inode)|
(1U << KEY_TYPE_inode_v2)|
+ (1U << KEY_TYPE_inode_v3)|
(1U << KEY_TYPE_inode_generation),
[BKEY_TYPE_dirents] =
(1U << KEY_TYPE_deleted)|
@@ -130,7 +166,8 @@ static unsigned bch2_key_types_allowed[] = {
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_alloc)|
(1U << KEY_TYPE_alloc_v2)|
- (1U << KEY_TYPE_alloc_v3),
+ (1U << KEY_TYPE_alloc_v3)|
+ (1U << KEY_TYPE_alloc_v4),
[BKEY_TYPE_quotas] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_quota),
@@ -147,128 +184,148 @@ static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_snapshots] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_snapshot),
+ [BKEY_TYPE_lru] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_set),
+ [BKEY_TYPE_freespace] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_set),
+ [BKEY_TYPE_need_discard] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_set),
+ [BKEY_TYPE_backpointers] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_backpointer),
+ [BKEY_TYPE_bucket_gens] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_bucket_gens),
[BKEY_TYPE_btree] =
(1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
(1U << KEY_TYPE_btree_ptr_v2),
};
-const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type)
+int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ unsigned flags, struct printbuf *err)
{
- if (k.k->u64s < BKEY_U64s)
- return "u64s too small";
-
- if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
- return "invalid key type for this btree";
+ if (k.k->u64s < BKEY_U64s) {
+ prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (type == BKEY_TYPE_btree &&
- bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
+ if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) {
+ prt_printf(err, "invalid key type for btree %s (%s)",
+ bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
+ return -BCH_ERR_invalid_bkey;
+ }
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- if (k.k->size == 0)
- return "bad size field";
+ if (k.k->size == 0) {
+ prt_printf(err, "size == 0");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (k.k->size > k.k->p.offset)
- return "size greater than offset";
+ if (k.k->size > k.k->p.offset) {
+ prt_printf(err, "size greater than offset (%u > %llu)",
+ k.k->size, k.k->p.offset);
+ return -BCH_ERR_invalid_bkey;
+ }
} else {
- if (k.k->size)
- return "nonzero size field";
+ if (k.k->size) {
+ prt_printf(err, "size != 0");
+ return -BCH_ERR_invalid_bkey;
+ }
}
if (type != BKEY_TYPE_btree &&
!btree_type_has_snapshots(type) &&
- k.k->p.snapshot)
- return "nonzero snapshot";
+ k.k->p.snapshot) {
+ prt_printf(err, "nonzero snapshot");
+ return -BCH_ERR_invalid_bkey;
+ }
if (type != BKEY_TYPE_btree &&
btree_type_has_snapshots(type) &&
- !k.k->p.snapshot)
- return "invalid snapshot field";
+ !k.k->p.snapshot) {
+ prt_printf(err, "snapshot == 0");
+ return -BCH_ERR_invalid_bkey;
+ }
if (type != BKEY_TYPE_btree &&
- !bkey_cmp(k.k->p, POS_MAX))
- return "POS_MAX key";
-
- return NULL;
-}
+ bkey_eq(k.k->p, POS_MAX)) {
+ prt_printf(err, "key at POS_MAX");
+ return -BCH_ERR_invalid_bkey;
+ }
-const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type)
-{
- return __bch2_bkey_invalid(c, k, type) ?:
- bch2_bkey_val_invalid(c, k);
+ return 0;
}
-const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
+int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ unsigned flags, struct printbuf *err)
{
- if (bpos_cmp(k.k->p, b->data->min_key) < 0)
- return "key before start of btree node";
-
- if (bpos_cmp(k.k->p, b->data->max_key) > 0)
- return "key past end of btree node";
-
- return NULL;
+ return __bch2_bkey_invalid(c, k, type, flags, err) ?:
+ bch2_bkey_val_invalid(c, k, flags, err);
}
-void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
+int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
+ struct printbuf *err)
{
- const char *invalid;
-
- BUG_ON(!k.k->u64s);
-
- invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
- bch2_bkey_in_btree_node(b, k);
- if (invalid) {
- char buf[160];
+ if (bpos_lt(k.k->p, b->data->min_key)) {
+ prt_printf(err, "key before start of btree node");
+ return -BCH_ERR_invalid_bkey;
+ }
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
+ if (bpos_gt(k.k->p, b->data->max_key)) {
+ prt_printf(err, "key past end of btree node");
+ return -BCH_ERR_invalid_bkey;
}
+
+ return 0;
}
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
- if (!bpos_cmp(pos, POS_MIN))
- pr_buf(out, "POS_MIN");
- else if (!bpos_cmp(pos, POS_MAX))
- pr_buf(out, "POS_MAX");
- else if (!bpos_cmp(pos, SPOS_MAX))
- pr_buf(out, "SPOS_MAX");
+ if (bpos_eq(pos, POS_MIN))
+ prt_printf(out, "POS_MIN");
+ else if (bpos_eq(pos, POS_MAX))
+ prt_printf(out, "POS_MAX");
+ else if (bpos_eq(pos, SPOS_MAX))
+ prt_printf(out, "SPOS_MAX");
else {
if (pos.inode == U64_MAX)
- pr_buf(out, "U64_MAX");
+ prt_printf(out, "U64_MAX");
else
- pr_buf(out, "%llu", pos.inode);
- pr_buf(out, ":");
+ prt_printf(out, "%llu", pos.inode);
+ prt_printf(out, ":");
if (pos.offset == U64_MAX)
- pr_buf(out, "U64_MAX");
+ prt_printf(out, "U64_MAX");
else
- pr_buf(out, "%llu", pos.offset);
- pr_buf(out, ":");
+ prt_printf(out, "%llu", pos.offset);
+ prt_printf(out, ":");
if (pos.snapshot == U32_MAX)
- pr_buf(out, "U32_MAX");
+ prt_printf(out, "U32_MAX");
else
- pr_buf(out, "%u", pos.snapshot);
+ prt_printf(out, "%u", pos.snapshot);
}
}
void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
if (k) {
- pr_buf(out, "u64s %u type ", k->u64s);
+ prt_printf(out, "u64s %u type ", k->u64s);
if (k->type < KEY_TYPE_MAX)
- pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+ prt_printf(out, "%s ", bch2_bkey_types[k->type]);
else
- pr_buf(out, "%u ", k->type);
+ prt_printf(out, "%u ", k->type);
bch2_bpos_to_text(out, k->p);
- pr_buf(out, " len %u ver %llu", k->size, k->version.lo);
+ prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
} else {
- pr_buf(out, "(null)");
+ prt_printf(out, "(null)");
}
}
@@ -281,7 +338,7 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
if (likely(ops->val_to_text))
ops->val_to_text(out, c, k);
} else {
- pr_buf(out, "(invalid type %u)", k.k->type);
+ prt_printf(out, "(invalid type %u)", k.k->type);
}
}
@@ -291,7 +348,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_to_text(out, k.k);
if (bkey_val_bytes(k.k)) {
- pr_buf(out, ": ");
+ prt_printf(out, ": ");
bch2_val_to_text(out, c, k);
}
}
@@ -317,7 +374,11 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
- return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r);
+ return bch2_bkey_maybe_mergable(l.k, r.k) &&
+ (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
+ bch2_bkey_ops[l.k->type].key_merge &&
+ !bch2_key_merging_disabled &&
+ ops->key_merge(c, l, r);
}
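/*
 * Standalone sketch (not from the patch) of why the (u64) cast in the size
 * check above matters: key sizes are 32-bit, so adding two of them can wrap
 * before the comparison. The limit value here is illustrative.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t a = UINT32_MAX, b = 2;
	uint64_t limit = (uint64_t) UINT32_MAX + 1;

	printf("%d\n", a + b <= limit);            /* 1: wrapped to 1, wrongly passes */
	printf("%d\n", (uint64_t) a + b <= limit); /* 0: widened sum correctly fails */
	return 0;
}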
static const struct old_bkey_type {
@@ -386,6 +447,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
btree_id == BTREE_ID_inodes) {
if (!bkey_packed(k)) {
struct bkey_i *u = packed_to_bkey(k);
+
swap(u->k.p.inode, u->k.p.offset);
} else if (f->bits_per_field[BKEY_FIELD_INODE] &&
f->bits_per_field[BKEY_FIELD_OFFSET]) {
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3012035db1a3..6ae517884a37 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -6,20 +6,31 @@
struct bch_fs;
struct btree;
+struct btree_trans;
struct bkey;
enum btree_node_type;
extern const char * const bch2_bkey_types[];
+/*
+ * key_invalid: checks validity of @k, returns 0 if good or
+ * -BCH_ERR_invalid_bkey if bad; if invalid, the entire key will be deleted.
+ *
+ * When invalid, an error string is returned via @err. @flags carries the
+ * read/write direction; more aggressive checks can be enabled when the key is
+ * being written (WRITE).
+ */
struct bkey_ops {
- /* Returns reason for being invalid if invalid, else NULL: */
- const char * (*key_invalid)(const struct bch_fs *,
- struct bkey_s_c);
+ int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+ int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+ int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -27,14 +38,14 @@ struct bkey_ops {
extern const struct bkey_ops bch2_bkey_ops[];
-const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
-const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
- enum btree_node_type);
-const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
- enum btree_node_type);
-const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
+#define BKEY_INVALID_FROM_JOURNAL (1 << 1)
-void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+ enum btree_node_type, unsigned, struct printbuf *);
+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
+ enum btree_node_type, unsigned, struct printbuf *);
+int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
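/*
 * Sketch of a typical caller of the int-returning validators declared above;
 * the WRITE flags value and bch_err() follow existing bcachefs conventions
 * but are assumptions here, not part of this hunk:
 */
struct printbuf err = PRINTBUF;
int ret = bch2_bkey_invalid(c, k, btree_node_type(b), WRITE, &err);

if (ret)
	bch_err(c, "invalid bkey: %s", err.buf);
printbuf_exit(&err);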
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
@@ -51,14 +62,102 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
{
return l->type == r->type &&
!bversion_cmp(l->version, r->version) &&
- !bpos_cmp(l->p, bkey_start_pos(r)) &&
- (u64) l->size + r->size <= KEY_SIZE_MAX &&
- bch2_bkey_ops[l->type].key_merge &&
- !bch2_key_merging_disabled;
+ bpos_eq(l->p, bkey_start_pos(r));
}
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+static inline int bch2_mark_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type];
+
+ return ops->atomic_trigger
+ ? ops->atomic_trigger(trans, btree, level, old, new, flags)
+ : 0;
+}
+
+enum btree_update_flags {
+ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ __BTREE_UPDATE_NOJOURNAL,
+ __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+ __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY,
+
+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
+
+ __BTREE_TRIGGER_INSERT,
+ __BTREE_TRIGGER_OVERWRITE,
+
+ __BTREE_TRIGGER_GC,
+ __BTREE_TRIGGER_BUCKET_INVALIDATE,
+ __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+#define BTREE_UPDATE_NO_KEY_CACHE_COHERENCY \
+ (1U << __BTREE_UPDATE_NO_KEY_CACHE_COHERENCY)
+
+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+
+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
+
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
+
+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
+ ((1U << KEY_TYPE_alloc)| \
+ (1U << KEY_TYPE_alloc_v2)| \
+ (1U << KEY_TYPE_alloc_v3)| \
+ (1U << KEY_TYPE_alloc_v4)| \
+ (1U << KEY_TYPE_stripe)| \
+ (1U << KEY_TYPE_inode)| \
+ (1U << KEY_TYPE_inode_v2)| \
+ (1U << KEY_TYPE_snapshot))
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type];
+
+ return ops->trans_trigger
+ ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+ : 0;
+}
+
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = old.k->p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_i *new, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = new->k.p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags);
+}
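/*
 * Hypothetical helper (not part of the patch) showing how the two wrappers
 * above decompose an overwrite: the old key is marked against a synthesized
 * deleted "new" key, and the new key against a synthesized deleted "old" key.
 */
static int mark_overwrite(struct btree_trans *trans, enum btree_id btree_id,
			  unsigned level, struct bkey_s_c old,
			  struct bkey_i *new, unsigned flags)
{
	return bch2_trans_mark_old(trans, btree_id, level, old, flags) ?:
	       bch2_trans_mark_new(trans, btree_id, level, new, flags);
}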
+
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
index b1385a77da11..cdef41db7692 100644
--- a/fs/bcachefs/bkey_sort.c
+++ b/fs/bcachefs/bkey_sort.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_buf.h"
+#include "bkey_cmp.h"
#include "bkey_sort.h"
#include "bset.h"
#include "extents.h"
@@ -45,7 +46,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
BUG_ON(!iter->used);
- i->k = bkey_next(i->k);
+ i->k = bkey_p_next(i->k);
BUG_ON(i->k > i->end);
@@ -107,7 +108,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
!should_drop_next_key(iter)) {
bkey_copy(out, k);
btree_keys_account_key_add(&nr, 0, out);
- out = bkey_next(out);
+ out = bkey_p_next(out);
}
sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
@@ -143,8 +144,10 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
else
bch2_bkey_unpack(src, (void *) out, in);
+ out->needs_whiteout = false;
+
btree_keys_account_key_add(&nr, 0, out);
- out = bkey_next(out);
+ out = bkey_p_next(out);
}
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
@@ -155,7 +158,7 @@ static inline int sort_keys_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bch2_bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
(int) l->needs_whiteout - (int) r->needs_whiteout;
}
@@ -177,7 +180,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
continue;
while ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed(iter->b, in, next)) {
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
needs_whiteout |= in->needs_whiteout;
@@ -191,7 +194,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
bkey_copy(out, in);
}
out->needs_whiteout |= needs_whiteout;
- out = bkey_next(out);
+ out = bkey_p_next(out);
}
return (u64 *) out - (u64 *) dst;
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 6000a8796bc5..0216ad96777a 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -36,16 +36,7 @@ static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
{
- unsigned offset = __btree_node_key_to_offset(b, k);
- struct bset_tree *t;
-
- for_each_bset(b, t)
- if (offset <= t->end_offset) {
- EBUG_ON(offset < btree_bkey_first_offset(t));
- return t;
- }
-
- BUG();
+ return bch2_bkey_to_bset_inlined(b, k);
}
/*
@@ -70,7 +61,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
struct bkey_packed *_k, *_n;
struct bkey uk, n;
struct bkey_s_c k;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
if (!i->u64s)
return;
@@ -78,30 +69,33 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
for (_k = i->start;
_k < vstruct_last(i);
_k = _n) {
- _n = bkey_next(_k);
+ _n = bkey_p_next(_k);
k = bkey_disassemble(b, _k, &uk);
+
+ printbuf_reset(&buf);
if (c)
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ bch2_bkey_val_to_text(&buf, c, k);
else
- bch2_bkey_to_text(&PBUF(buf), k.k);
+ bch2_bkey_to_text(&buf, k.k);
printk(KERN_ERR "block %u key %5zu: %s\n", set,
- _k->_data - i->_data, buf);
+ _k->_data - i->_data, buf.buf);
if (_n == vstruct_last(i))
continue;
n = bkey_unpack_key(b, _n);
- if (bpos_cmp(n.p, k.k->p) < 0) {
+ if (bpos_lt(n.p, k.k->p)) {
printk(KERN_ERR "Key skipped backwards\n");
continue;
}
- if (!bkey_deleted(k.k) &&
- !bpos_cmp(n.p, k.k->p))
+ if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
printk(KERN_ERR "Duplicate keys\n");
}
+
+ printbuf_exit(&buf);
}
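/*
 * The printbuf lifecycle the conversion above relies on (a usage sketch, not
 * part of the patch; k and pos stand for whatever bkey/bpos is at hand):
 */
struct printbuf buf = PRINTBUF;

bch2_bkey_to_text(&buf, k);
printk(KERN_ERR "%s\n", buf.buf);

printbuf_reset(&buf);		/* reuse the same heap buffer */
bch2_bpos_to_text(&buf, pos);

printbuf_exit(&buf);		/* release the allocation */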
void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
@@ -118,6 +112,7 @@ void bch2_dump_btree_node_iter(struct btree *b,
struct btree_node_iter *iter)
{
struct btree_node_iter_set *set;
+ struct printbuf buf = PRINTBUF;
printk(KERN_ERR "btree node iter with %u/%u sets:\n",
__btree_node_iter_used(iter), b->nsets);
@@ -126,12 +121,14 @@ void bch2_dump_btree_node_iter(struct btree *b,
struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
struct bset_tree *t = bch2_bkey_to_bset(b, k);
struct bkey uk = bkey_unpack_key(b, k);
- char buf[100];
- bch2_bkey_to_text(&PBUF(buf), &uk);
+ printbuf_reset(&buf);
+ bch2_bkey_to_text(&buf, &uk);
printk(KERN_ERR "set %zu key %u: %s\n",
- t - b->set, set->k, buf);
+ t - b->set, set->k, buf.buf);
}
+
+ printbuf_exit(&buf);
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -167,13 +164,14 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
struct btree_node_iter_set *set;
struct bkey ku = bkey_unpack_key(b, k);
struct bkey nu = bkey_unpack_key(b, n);
- char buf1[80], buf2[80];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &ku);
- bch2_bkey_to_text(&PBUF(buf2), &nu);
+ bch2_bkey_to_text(&buf1, &ku);
+ bch2_bkey_to_text(&buf2, &nu);
printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
printk(KERN_ERR "iter was:");
btree_node_iter_for_each(_iter, set) {
@@ -238,6 +236,8 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
#if 0
BUG_ON(prev &&
bkey_iter_cmp(b, prev, insert) > 0);
@@ -246,17 +246,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
bkey_iter_cmp(b, prev, insert) > 0) {
struct bkey k1 = bkey_unpack_key(b, prev);
struct bkey k2 = bkey_unpack_key(b, insert);
- char buf1[100];
- char buf2[100];
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &k1);
- bch2_bkey_to_text(&PBUF(buf2), &k2);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
panic("prev > insert:\n"
"prev key %s\n"
"insert key %s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
#endif
#if 0
@@ -267,17 +265,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
bkey_iter_cmp(b, insert, next) > 0) {
struct bkey k1 = bkey_unpack_key(b, insert);
struct bkey k2 = bkey_unpack_key(b, next);
- char buf1[100];
- char buf2[100];
bch2_dump_btree_node(NULL, b);
- bch2_bkey_to_text(&PBUF(buf1), &k1);
- bch2_bkey_to_text(&PBUF(buf2), &k2);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
panic("insert > next:\n"
"insert key %s\n"
"next key %s\n",
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
#endif
}
@@ -536,7 +532,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
goto start;
while (1) {
if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k,
+ BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
bkey_unpack_pos(b, k)));
start:
if (++j == t->size)
@@ -546,7 +542,7 @@ start:
rw_aux_tree(b, t)[j - 1].offset);
}
- k = bkey_next(k);
+ k = bkey_p_next(k);
BUG_ON(k >= btree_bkey_last(b, t));
}
}
@@ -737,7 +733,7 @@ retry:
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_next(k);
+ prev = k, k = bkey_p_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -754,7 +750,7 @@ retry:
}
while (k != btree_bkey_last(b, t))
- prev = k, k = bkey_next(k);
+ prev = k, k = bkey_p_next(k);
if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
bkey_init(&min_key.k);
@@ -892,7 +888,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_next(i))
+ for (i = p; i != k; i = bkey_p_next(i))
if (i->type >= min_key_type)
ret = i;
@@ -903,10 +899,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
BUG_ON(ret >= orig_k);
for (i = ret
- ? bkey_next(ret)
+ ? bkey_p_next(ret)
: btree_bkey_first(b, t);
i != orig_k;
- i = bkey_next(i))
+ i = bkey_p_next(i))
BUG_ON(i->type >= min_key_type);
}
@@ -959,7 +955,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
t->size -= j - l;
for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
+ rw_aux_tree(b, t)[j].offset += shift;
EBUG_ON(l < t->size &&
rw_aux_tree(b, t)[l].offset ==
@@ -978,7 +974,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
struct bkey_packed *k = start;
while (1) {
- k = bkey_next(k);
+ k = bkey_p_next(k);
if (k == end)
break;
@@ -1071,7 +1067,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,
while (l + 1 != r) {
unsigned m = (l + r) >> 1;
- if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
+ if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
l = m;
else
r = m;
@@ -1212,12 +1208,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
while (m != btree_bkey_last(b, t) &&
bkey_iter_cmp_p_or_unp(b, m,
lossy_packed_search, search) < 0)
- m = bkey_next(m);
+ m = bkey_p_next(m);
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
bkey_iter_pos_cmp(b, m, search) < 0)
- m = bkey_next(m);
+ m = bkey_p_next(m);
if (bch2_expensive_debug_checks) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
@@ -1260,7 +1256,7 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter,
bch2_btree_node_iter_sort(iter, b);
}
-noinline __flatten __attribute__((cold))
+noinline __flatten __cold
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
@@ -1324,8 +1320,8 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
struct bkey_packed *k[MAX_BSETS];
unsigned i;
- EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
- EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0);
+ EBUG_ON(bpos_lt(*search, b->data->min_key));
+ EBUG_ON(bpos_gt(*search, b->data->max_key));
bset_aux_tree_verify(b);
memset(iter, 0, sizeof(*iter));
@@ -1435,7 +1431,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
EBUG_ON(iter->data->k > iter->data->end);
if (unlikely(__btree_node_iter_set_end(iter, 0))) {
- bch2_btree_node_iter_set_drop(iter, iter->data);
+ /* avoid an expensive memmove call: */
+ iter->data[0] = iter->data[1];
+ iter->data[1] = iter->data[2];
+ iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
return;
}
@@ -1537,9 +1536,9 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
/* Mergesort */
-void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
+void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
{
- struct bset_tree *t;
+ const struct bset_tree *t;
for_each_bset(b, t) {
enum bset_aux_tree_type type = bset_aux_tree_type(t);
@@ -1567,9 +1566,6 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
struct bkey uk;
unsigned j, inorder;
- if (out->pos != out->end)
- *out->pos = '\0';
-
if (!bset_has_ro_aux_tree(t))
return;
@@ -1584,12 +1580,12 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
switch (bkey_float(b, t, j)->exponent) {
case BFLOAT_FAILED:
uk = bkey_unpack_key(b, k);
- pr_buf(out,
+ prt_printf(out,
" failed unpacked at depth %u\n"
"\t",
ilog2(j));
bch2_bpos_to_text(out, uk.p);
- pr_buf(out, "\n");
+ prt_printf(out, "\n");
break;
}
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 0d46534c3dcd..632c2b8c5460 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -205,109 +205,15 @@ static inline size_t btree_aux_data_u64s(const struct btree *b)
return btree_aux_data_bytes(b) / sizeof(u64);
}
-typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
-
-static inline void
-__bkey_unpack_key_format_checked(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- {
- compiled_unpack_fn unpack_fn = b->aux_data;
- unpack_fn(dst, src);
-
- if (bch2_expensive_debug_checks) {
- struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
-
- BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
- }
- }
-#else
- *dst = __bch2_bkey_unpack_key(&b->format, src);
-#endif
-}
-
-static inline struct bkey
-bkey_unpack_key_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
- struct bkey dst;
-
- __bkey_unpack_key_format_checked(b, &dst, src);
- return dst;
-}
-
-static inline void __bkey_unpack_key(const struct btree *b,
- struct bkey *dst,
- const struct bkey_packed *src)
-{
- if (likely(bkey_packed(src)))
- __bkey_unpack_key_format_checked(b, dst, src);
- else
- *dst = *packed_to_bkey_c(src);
-}
-
-/**
- * bkey_unpack_key -- unpack just the key, not the value
- */
-static inline struct bkey bkey_unpack_key(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_key_format_checked(b, src)
- : *packed_to_bkey_c(src);
-}
-
-static inline struct bpos
-bkey_unpack_pos_format_checked(const struct btree *b,
- const struct bkey_packed *src)
-{
-#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
- return bkey_unpack_key_format_checked(b, src).p;
-#else
- return __bkey_unpack_pos(&b->format, src);
-#endif
-}
-
-static inline struct bpos bkey_unpack_pos(const struct btree *b,
- const struct bkey_packed *src)
-{
- return likely(bkey_packed(src))
- ? bkey_unpack_pos_format_checked(b, src)
- : packed_to_bkey_c(src)->p;
-}
-
-/* Disassembled bkeys */
-
-static inline struct bkey_s_c bkey_disassemble(struct btree *b,
- const struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
-}
-
-/* non const version: */
-static inline struct bkey_s __bkey_disassemble(struct btree *b,
- struct bkey_packed *k,
- struct bkey *u)
-{
- __bkey_unpack_key(b, u, k);
-
- return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
-}
-
#define for_each_bset(_b, _t) \
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
#define bset_tree_for_each_key(_b, _t, _k) \
for (_k = btree_bkey_first(_b, _t); \
_k != btree_bkey_last(_b, _t); \
- _k = bkey_next(_k))
+ _k = bkey_p_next(_k))
-static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
+static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
}
@@ -385,6 +291,21 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
}
+static inline struct bset_tree *
+bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
+{
+ unsigned offset = __btree_node_key_to_offset(b, k);
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ if (offset <= t->end_offset) {
+ EBUG_ON(offset < btree_bkey_first_offset(t));
+ return t;
+ }
+
+ BUG();
+}
+
struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
@@ -526,6 +447,11 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
struct btree *,
struct bkey *);
+#define for_each_btree_node_key(b, k, iter) \
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
+ (k = bch2_btree_node_iter_peek((iter), (b))); \
+ bch2_btree_node_iter_advance(iter, b))
+
#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
for (bch2_btree_node_iter_init_from_start((iter), (b)); \
(k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
@@ -578,7 +504,7 @@ struct bset_stats {
size_t failed;
};
-void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
+void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
void bch2_bfloat_to_text(struct printbuf *, struct btree *,
struct bkey_packed *);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index cfede3344883..c53597a29e2e 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -7,13 +7,26 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
+#include "errcode.h"
#include "error.h"
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
#include <trace/events/bcachefs.h>
-struct lock_class_key bch2_btree_node_lock_key;
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do { \
+ if (shrinker_counter) \
+ bc->not_freed_##counter++; \
+} while (0)
+
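/*
 * Standalone sketch (not from the patch) of the two idioms in the macro
 * above: do { } while (0) so the macro expands to a single statement, and
 * ## token pasting to pick a counter field by name. Names are illustrative.
 */
#include <stdio.h>

struct stats { unsigned not_freed_dirty, not_freed_locked; };

#define NOT_FREED_INCREMENT(s, counter)		\
do {						\
	(s)->not_freed_##counter++;		\
} while (0)

int main(void)
{
	struct stats s = { 0, 0 };

	NOT_FREED_INCREMENT(&s, dirty);
	NOT_FREED_INCREMENT(&s, locked);
	printf("%u %u\n", s.not_freed_dirty, s.not_freed_locked); /* 1 1 */
	return 0;
}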
+const char * const bch2_btree_node_flags[] = {
+#define x(f) #f,
+ BTREE_FLAGS()
+#undef x
+ NULL
+};
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
@@ -35,6 +48,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
return max_t(int, 0, bc->used - bc->reserve);
}
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+ if (b->c.lock.readers)
+ list_move(&b->list, &bc->freed_pcpu);
+ else
+ list_move(&b->list, &bc->freed_nonpcpu);
+}
+
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
@@ -51,7 +72,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
b->aux_data = NULL;
bc->used--;
- list_move(&b->list, &bc->freed);
+
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -76,10 +98,9 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->data = kvpmalloc(btree_bytes(c), gfp);
if (!b->data)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
- b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp,
- PAGE_KERNEL_EXEC);
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -90,20 +111,25 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
return 0;
}
-static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
- struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
+ struct btree *b;
+
+ b = kzalloc(sizeof(struct btree), gfp);
if (!b)
return NULL;
bkey_btree_ptr_init(&b->key);
- __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
+ bch2_btree_lock_init(&b->c);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_set_no_check_recursion(&b->c.lock.dep_map);
+#endif
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
b->byte_order = ilog2(btree_bytes(c));
@@ -113,7 +139,9 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b = __btree_node_mem_alloc(c);
+ struct btree *b;
+
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
return NULL;
@@ -132,12 +160,11 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+
BUG_ON(ret);
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
-
- six_lock_wakeup_all(&b->c.lock);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -157,15 +184,10 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
b->c.level = level;
b->c.btree_id = id;
- if (level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free_rcu(&b->c.lock);
-
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
- list_add(&b->list, &bc->live);
+ list_add_tail(&b->list, &bc->live);
mutex_unlock(&bc->lock);
return ret;
@@ -184,7 +206,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
@@ -194,40 +216,64 @@ wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
- return -ENOMEM;
+ if (!flush) {
+ if (btree_node_dirty(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ else if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
- if (!six_trylock_intent(&b->c.lock))
- return -ENOMEM;
+ if (!six_trylock_intent(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
- if (!six_trylock_write(&b->c.lock))
+ if (!six_trylock_write(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
goto out_unlock_intent;
+ }
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
goto out_unlock;
+ }
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
- if (btree_node_noevict(b))
+ if (btree_node_noevict(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
goto out_unlock;
-
- if (!btree_node_may_write(b))
+ }
+ if (btree_node_write_blocked(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
goto out_unlock;
+ }
+ if (btree_node_will_make_reachable(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
+ goto out_unlock;
+ }
if (btree_node_dirty(b)) {
- if (!flush ||
- test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+ if (!flush) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
goto out_unlock;
+ }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
@@ -235,9 +281,11 @@ wait_on_io:
* the post write cleanup:
*/
if (bch2_verify_btree_ondisk)
- bch2_btree_node_write(c, b, SIX_LOCK_intent);
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
+ BTREE_WRITE_cache_reclaim);
else
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b,
+ BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -245,24 +293,24 @@ wait_on_io:
}
out:
if (b->hash_val && !ret)
- trace_btree_node_reap(c, b);
+ trace_and_count(c, btree_cache_reap, c, b);
return ret;
out_unlock:
six_unlock_write(&b->c.lock);
out_unlock_intent:
six_unlock_intent(&b->c.lock);
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
- return __btree_node_reclaim(c, b, false);
+ return __btree_node_reclaim(c, b, false, shrinker_counter);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true);
+ return __btree_node_reclaim(c, b, true, false);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
@@ -273,21 +321,18 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
- unsigned long can_free;
- unsigned long touched = 0;
+ unsigned long can_free = 0;
unsigned long freed = 0;
+ unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
+ bool trigger_writes = atomic_read(&bc->dirty) + nr >=
+ bc->used * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
- /* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_FS)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- goto out_norestore;
-
+ mutex_lock(&bc->lock);
flags = memalloc_nofs_save();
/*
@@ -297,7 +342,6 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- nr /= btree_pages(c);
can_free = btree_cache_can_free(bc);
nr = min_t(unsigned long, nr, can_free);
@@ -313,61 +357,61 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
touched++;
if (touched >= nr)
- break;
+ goto out;
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, true)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
+ bc->freed++;
}
}
restart:
list_for_each_entry_safe(b, t, &bc->live, list) {
touched++;
- if (touched >= nr) {
- /* Save position */
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
- break;
- }
-
- if (!btree_node_accessed(b) &&
- !btree_node_reclaim(c, b)) {
- /* can't call bch2_btree_node_hash_remove under lock */
+ if (btree_node_accessed(b)) {
+ clear_btree_node_accessed(b);
+ bc->not_freed_access_bit++;
+ } else if (!btree_node_reclaim(c, b, true)) {
freed++;
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
-
btree_node_data_free(c, b);
- mutex_unlock(&bc->lock);
+ bc->freed++;
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
- if (freed >= nr)
- goto out;
-
- if (sc->gfp_mask & __GFP_FS)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- goto out;
+ if (freed == nr)
+ goto out_rotate;
+ } else if (trigger_writes &&
+ btree_node_dirty(b) &&
+ !btree_node_will_make_reachable(b) &&
+ !btree_node_write_blocked(b) &&
+ six_trylock_read(&b->c.lock)) {
+ list_move(&bc->live, &b->list);
+ mutex_unlock(&bc->lock);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+ six_unlock_read(&b->c.lock);
+ if (touched >= nr)
+ goto out_nounlock;
+ mutex_lock(&bc->lock);
goto restart;
- } else
- clear_btree_node_accessed(b);
- }
+ }
- mutex_unlock(&bc->lock);
+ if (touched >= nr)
+ break;
+ }
+out_rotate:
+ if (&t->list != &bc->live)
+ list_move_tail(&bc->live, &t->list);
out:
- ret = (unsigned long) freed * btree_pages(c);
+ mutex_unlock(&bc->lock);
+out_nounlock:
+ ret = freed;
memalloc_nofs_restore(flags);
-out_norestore:
- trace_btree_cache_scan(sc->nr_to_scan,
- sc->nr_to_scan / btree_pages(c),
- btree_cache_can_free(bc),
- ret);
+ trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
return ret;
}
@@ -381,7 +425,19 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc) * btree_pages(c);
+ return btree_cache_can_free(bc);
+}
+
+static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
+{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_cache.shrink);
+ char *cbuf;
+ size_t buflen = seq_buf_get_buf(s, &cbuf);
+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
+
+ bch2_btree_cache_to_text(&out, &c->btree_cache);
+ seq_buf_commit(s, out.pos);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
@@ -416,15 +472,17 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(c, b);
+ clear_btree_node_dirty_acct(c, b);
btree_node_data_free(c, b);
}
BUG_ON(atomic_read(&c->btree_cache.dirty));
- while (!list_empty(&bc->freed)) {
- b = list_first_entry(&bc->freed, struct btree, list);
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ while (!list_empty(&bc->freed_nonpcpu)) {
+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_del(&b->list);
six_lock_pcpu_free(&b->c.lock);
kfree(b);
@@ -455,7 +513,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
for (i = 0; i < bc->reserve; i++)
if (!__bch2_btree_node_mem_alloc(c)) {
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_fs_btree_cache_init;
goto out;
}
@@ -465,9 +523,9 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bc->shrink.count_objects = bch2_btree_cache_count;
bc->shrink.scan_objects = bch2_btree_cache_scan;
+ bc->shrink.to_text = bch2_btree_cache_shrinker_to_text;
bc->shrink.seeks = 4;
- bc->shrink.batch = btree_pages(c) * 2;
- ret = register_shrinker(&bc->shrink);
+ ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
@@ -478,7 +536,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
@@ -492,7 +551,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_btree_node_cannibalize_unlock(c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, c);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
@@ -508,8 +567,8 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
if (!cl) {
- trace_btree_node_cannibalize_lock_fail(c);
- return -ENOMEM;
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
}
closure_wait(&bc->alloc_wait, cl);
@@ -522,11 +581,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
}
- trace_btree_node_cannibalize_lock_fail(c);
- return -EAGAIN;
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
success:
- trace_btree_node_cannibalize_lock(c);
+ trace_and_count(c, btree_cache_cannibalize_lock, c);
return 0;
}
@@ -536,7 +595,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b))
+ if (!btree_node_reclaim(c, b, false))
return b;
while (1) {
@@ -553,10 +612,14 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
}
}
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
+ struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
@@ -564,44 +627,54 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
mutex_lock(&bc->lock);
/*
- * btree_free() doesn't free memory; it sticks the node on the end of
- * the list. Check if there's any freed nodes there:
- */
- list_for_each_entry(b, &bc->freeable, list)
- if (!btree_node_reclaim(c, b))
- goto got_node;
-
- /*
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
- list_for_each_entry(b, &bc->freed, list)
- if (!btree_node_reclaim(c, b))
+ list_for_each_entry(b, freed, list)
+ if (!btree_node_reclaim(c, b, false)) {
+ list_del_init(&b->list);
goto got_node;
+ }
- b = NULL;
-got_node:
- if (b)
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
-
+ b = __btree_node_mem_alloc(c, __GFP_NOWARN);
if (!b) {
- b = __btree_node_mem_alloc(c);
+ mutex_unlock(&bc->lock);
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
goto err;
-
- BUG_ON(!six_trylock_intent(&b->c.lock));
- BUG_ON(!six_trylock_write(&b->c.lock));
+ mutex_lock(&bc->lock);
}
- if (!b->data) {
- if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
- goto err;
+ if (pcpu_read_locks)
+ six_lock_pcpu_alloc(&b->c.lock);
- mutex_lock(&bc->lock);
- bc->used++;
- mutex_unlock(&bc->lock);
- }
+ BUG_ON(!six_trylock_intent(&b->c.lock));
+ BUG_ON(!six_trylock_write(&b->c.lock));
+got_node:
+
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b2, &bc->freeable, list)
+ if (!btree_node_reclaim(c, b2, false)) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ goto got_mem;
+ }
+
+ mutex_unlock(&bc->lock);
+
+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
+ goto err;
+
+ mutex_lock(&bc->lock);
+ bc->used++;
+got_mem:
+ mutex_unlock(&bc->lock);
BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_dirty(b));
@@ -624,32 +697,35 @@ out:
err:
mutex_lock(&bc->lock);
- if (b) {
- list_add(&b->list, &bc->freed);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- }
-
/* Try to cannibalize another cached btree node: */
if (bc->alloc_lock == current) {
- b = btree_node_cannibalize(c);
- list_del_init(&b->list);
- mutex_unlock(&bc->lock);
+ b2 = btree_node_cannibalize(c);
+ bch2_btree_node_hash_remove(bc, b2);
+
+ if (b) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ } else {
+ b = b2;
+ list_del_init(&b->list);
+ }
- bch2_btree_node_hash_remove(bc, b);
+ mutex_unlock(&bc->lock);
- trace_btree_node_cannibalize(c);
+ trace_and_count(c, btree_cache_cannibalize, c);
goto out;
}
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
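
Returning -BCH_ERR_ENOMEM_btree_node_mem_alloc instead of plain -ENOMEM, and matching it later with bch2_err_matches(..., ENOMEM), suggests private error codes that name a precise site while still belonging to an errno class. The following is only a model of that idea under assumed names and layout, not the bcachefs definitions:

        /* Private codes live above the normal errno range; a table maps each
         * one back to its errno class and a human-readable name. */
        #include <errno.h>
        #include <stdio.h>
        #include <string.h>

        #define ERR_BASE        2048

        enum {
                MYERR_ENOMEM_btree_node_mem_alloc = ERR_BASE,
        };

        static const struct { int class; const char *name; } err_table[] = {
                [MYERR_ENOMEM_btree_node_mem_alloc - ERR_BASE] =
                        { ENOMEM, "ENOMEM_btree_node_mem_alloc" },
        };

        static int err_matches(int err, int class)
        {
                err = -err;
                return err >= ERR_BASE
                        ? err_table[err - ERR_BASE].class == class
                        : err == class;
        }

        static const char *err_str(int err)
        {
                err = -err;
                return err >= ERR_BASE ? err_table[err - ERR_BASE].name : strerror(err);
        }

        int main(void)
        {
                int ret = -MYERR_ENOMEM_btree_node_mem_alloc;

                printf("matches ENOMEM: %d, str: %s\n",
                       err_matches(ret, ENOMEM), err_str(ret));
                return 0;
        }
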
/* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
- struct btree_trans *trans,
+static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
struct btree_path *path,
const struct bkey_i *k,
enum btree_id btree_id,
@@ -657,6 +733,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
enum six_lock_type lock_type,
bool sync)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
u32 seq;
@@ -666,17 +743,28 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
- if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
- trace_trans_restart_relock_parent_for_fill(trans->fn,
- _THIS_IP_, btree_id, &path->pos);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
+ if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
+ trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
+ }
+
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
+
+ if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ trans->memory_allocation_failure = true;
+ trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
}
- b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
return b;
+ /*
+ * Btree nodes read in from disk should not have the accessed bit set
+ * initially, so that linear scans don't thrash the cache:
+ */
+ clear_btree_node_accessed(b);
+
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
@@ -708,97 +796,74 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
if (!sync)
return NULL;
- if (trans &&
- (!bch2_trans_relock(trans) ||
- !bch2_btree_path_relock_intent(trans, path))) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(-EINTR);
+ if (path) {
+ int ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(ret);
+ }
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
- btree_id, &path->pos);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
+ if (path)
+ trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
}
return b;
}
-static int lock_node_check_fn(struct six_lock *lock, void *p)
-{
- struct btree *b = container_of(lock, struct btree, c.lock);
- const struct bkey_i *k = p;
-
- return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
-}
-
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
- char buf1[200], buf2[100], buf3[100];
+ struct printbuf buf = PRINTBUF;
if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
return;
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key));
- bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);
- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
-
- bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
- "btree %s level %u\n"
- "ptr: %s\n"
- "header: btree %s level %llu\n"
- "min %s max %s\n",
- bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
- bch2_btree_ids[BTREE_NODE_ID(b->data)],
- BTREE_NODE_LEVEL(b->data),
- buf2, buf3);
+ prt_printf(&buf,
+ "btree node header doesn't match ptr\n"
+ "btree %s level %u\n"
+ "ptr: ",
+ bch2_btree_ids[b->c.btree_id], b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_printf(&buf, "\nheader: btree %s level %llu\n"
+ "min ",
+ bch2_btree_ids[BTREE_NODE_ID(b->data)],
+ BTREE_NODE_LEVEL(b->data));
+ bch2_bpos_to_text(&buf, b->data->min_key);
+
+ prt_printf(&buf, "\nmax ");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
b->c.level != BTREE_NODE_LEVEL(b->data) ||
- bpos_cmp(b->data->max_key, b->key.k.p) ||
+ !bpos_eq(b->data->max_key, b->key.k.p) ||
(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
- bpos_cmp(b->data->min_key,
+ !bpos_eq(b->data->min_key,
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
btree_bad_header(c, b);
}
-/**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
- * in from disk if necessary.
- *
- * If IO is necessary and running under generic_make_request, returns -EAGAIN.
- *
- * The btree node will have either a read or a write lock held, depending on
- * the @write parameter.
- */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
- const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type,
- unsigned long trace_ip)
+static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
+ int ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
-
- b = btree_node_mem_ptr(k);
-
- /*
- * Check b->hash_val _before_ calling btree_node_lock() - this might not
- * be the node we want anymore, and trying to lock the wrong node could
- * cause an unneccessary transaction restart:
- */
- if (likely(c->opts.btree_node_mem_ptr_optimization &&
- b &&
- b->hash_val == btree_ptr_hash_val(k)))
- goto lock_node;
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
@@ -807,7 +872,7 @@ retry:
* else we could read in a btree node from disk that's been
* freed:
*/
- b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
+ b = bch2_btree_node_fill(trans, path, k, path->btree_id,
level, lock_type, true);
/* We raced and found the btree node in the cache */
@@ -817,44 +882,14 @@ retry:
if (IS_ERR(b))
return b;
} else {
-lock_node:
- /*
- * There's a potential deadlock with splits and insertions into
- * interior nodes we have to avoid:
- *
- * The other thread might be holding an intent lock on the node
- * we want, and they want to update its parent node so they're
- * going to upgrade their intent lock on the parent node to a
- * write lock.
- *
- * But if we're holding a read lock on the parent, and we're
- * trying to get the intent lock they're holding, we deadlock.
- *
- * So to avoid this we drop the read locks on parent nodes when
- * we're starting to take intent locks - and handle the race.
- *
- * The race is that they might be about to free the node we
- * want, and dropping our read lock on the parent node lets them
- * update the parent marking the node we want as freed, and then
- * free it:
- *
- * To guard against this, btree nodes are evicted from the cache
- * when they're freed - and b->hash_val is zeroed out, which we
- * check for after we lock the node.
- *
- * Then, bch2_btree_node_relock() on the parent will fail - because
- * the parent was modified, when the pointer to the node we want
- * was removed - and we'll bail out:
- */
if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(path, level + 1);
+ btree_node_unlock(trans, path, level + 1);
- if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type,
- lock_node_check_fn, (void *) k, trace_ip)) {
- if (!trans->restarted)
- goto retry;
- return ERR_PTR(-EINTR);
- }
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->c.level != level ||
@@ -863,13 +898,13 @@ lock_node:
if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
- trace_trans_restart_btree_node_reused(trans->fn,
- trace_ip,
- path->btree_id,
- &path->pos);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
}
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
}
if (unlikely(btree_node_read_in_flight(b))) {
@@ -884,11 +919,13 @@ lock_node:
* should_be_locked is not set on this path yet, so we need to
* relock it specifically:
*/
- if (trans &&
- (!bch2_trans_relock(trans) ||
- !bch2_btree_path_relock_intent(trans, path))) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(-EINTR);
+ if (trans) {
+ int ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(ret);
+ }
}
if (!six_relock_type(&b->c.lock, lock_type, seq))
@@ -905,6 +942,104 @@ lock_node:
prefetch(p + L1_CACHE_BYTES * 2);
}
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(b->c.btree_id != path->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+
+ return b;
+}
+
+/**
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * The btree node is returned locked; the type of lock held is given by the
+ * @lock_type parameter.
+ */
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bset_tree *t;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_node_mem_ptr(k);
+
+ /*
+ * Check b->hash_val _before_ calling btree_node_lock() - this might not
+ * be the node we want anymore, and trying to lock the wrong node could
+ * cause an unnecessary transaction restart:
+ */
+ if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
+ !b ||
+ b->hash_val != btree_ptr_hash_val(k)))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.level != level ||
+ race_fault())) {
+ six_unlock_type(&b->c.lock, lock_type);
+ if (bch2_btree_node_relock(trans, path, level + 1))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+ }
+
+ if (unlikely(btree_node_read_in_flight(b))) {
+ u32 seq = b->c.lock.state.seq;
+
+ six_unlock_type(&b->c.lock, lock_type);
+ bch2_trans_unlock(trans);
+
+ bch2_btree_node_wait_on_read(b);
+
+ /*
+ * should_be_locked is not set on this path yet, so we need to
+ * relock it specifically:
+ */
+ if (trans) {
+ int ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(ret);
+ }
+ }
+
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+ }
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
/* avoid atomic set bit if it's not needed: */
if (!btree_node_accessed(b))
set_btree_node_accessed(b);
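
The new bch2_btree_node_get() fast path above checks b->hash_val before locking, takes the lock, then re-checks and falls back to __bch2_btree_node_get() if the node was reused in the meantime. A simplified userspace sketch of that optimistic check / lock / re-validate pattern (struct entry, slow_lookup() and get_entry() are invented for the example):

        #include <pthread.h>
        #include <stdio.h>

        struct entry {
                pthread_mutex_t lock;
                unsigned long   hash_val;       /* zeroed when the entry is reused */
        };

        static struct entry *slow_lookup(unsigned long hash)
        {
                (void) hash;
                return NULL;                    /* stand-in for the full hash-table walk */
        }

        static struct entry *get_entry(struct entry *cached, unsigned long hash)
        {
                if (!cached || cached->hash_val != hash)
                        return slow_lookup(hash);       /* don't lock the wrong entry */

                pthread_mutex_lock(&cached->lock);

                if (cached->hash_val != hash) {         /* raced with reuse */
                        pthread_mutex_unlock(&cached->lock);
                        return slow_lookup(hash);
                }

                return cached;                          /* returned locked */
        }

        int main(void)
        {
                struct entry e = { PTHREAD_MUTEX_INITIALIZER, 42 };
                struct entry *got = get_entry(&e, 42);

                printf("fast path hit: %d\n", got == &e);
                if (got)
                        pthread_mutex_unlock(&got->lock);
                return 0;
        }
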
@@ -921,12 +1056,13 @@ lock_node:
return b;
}
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level,
bool nofill)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
@@ -945,7 +1081,7 @@ retry:
if (nofill)
goto out;
- b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
+ b = bch2_btree_node_fill(trans, NULL, k, btree_id,
level, SIX_LOCK_read, true);
/* We raced and found the btree node in the cache */
@@ -960,9 +1096,11 @@ retry:
goto out;
} else {
lock_node:
- ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
- if (ret)
- goto retry;
+ ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->c.btree_id != btree_id ||
@@ -1003,12 +1141,12 @@ out:
return b;
}
-int bch2_btree_node_prefetch(struct bch_fs *c,
- struct btree_trans *trans,
+int bch2_btree_node_prefetch(struct btree_trans *trans,
struct btree_path *path,
const struct bkey_i *k,
enum btree_id btree_id, unsigned level)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
@@ -1019,13 +1157,14 @@ int bch2_btree_node_prefetch(struct bch_fs *c,
if (b)
return 0;
- b = bch2_btree_node_fill(c, trans, path, k, btree_id,
+ b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false);
return PTR_ERR_OR_ZERO(b);
}
-void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
@@ -1037,15 +1176,15 @@ wait_on_io:
/* XXX we're called from btree_gc which will be holding other btree
* nodes locked
- * */
+ */
__bch2_btree_node_wait_on_read(b);
__bch2_btree_node_wait_on_write(b);
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
if (btree_node_dirty(b)) {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
@@ -1063,7 +1202,7 @@ wait_on_io:
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
- struct btree *b)
+ const struct btree *b)
{
const struct bkey_format *f = &b->format;
struct bset_stats stats;
@@ -1072,15 +1211,15 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
bch2_btree_keys_stats(b, &stats);
- pr_buf(out, "l %u ", b->c.level);
+ prt_printf(out, "l %u ", b->c.level);
bch2_bpos_to_text(out, b->data->min_key);
- pr_buf(out, " - ");
+ prt_printf(out, " - ");
bch2_bpos_to_text(out, b->data->max_key);
- pr_buf(out, ":\n"
+ prt_printf(out, ":\n"
" ptrs: ");
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
- pr_buf(out, "\n"
+ prt_printf(out, "\n"
" format: u64s %u fields %u %u %u %u %u\n"
" unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
@@ -1108,9 +1247,21 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
stats.failed);
}
-void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
- pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
- pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
- pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+ prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+
+ prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
+ prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
+ prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
+ prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
+ prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
+ prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
+ prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
+ prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
+ prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
+ prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
+
}
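
Throughout these hunks, fixed char buf[200] arrays and pr_buf()/PBUF() give way to a heap-backed printbuf that is appended to with prt_printf() and released with printbuf_exit(). A self-contained model of that kind of growable string builder, written for illustration rather than as the actual printbuf API:

        #include <stdarg.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct printbuf {
                char    *buf;
                size_t  size;
                size_t  pos;
        };

        #define PRINTBUF ((struct printbuf) { NULL, 0, 0 })

        static void prt_printf(struct printbuf *out, const char *fmt, ...)
        {
                va_list args;
                int len;

                va_start(args, fmt);
                len = vsnprintf(NULL, 0, fmt, args);   /* measure first */
                va_end(args);
                if (len < 0)
                        return;

                if (out->pos + len + 1 > out->size) {
                        size_t new_size = (out->pos + len + 1) * 2;
                        char *n = realloc(out->buf, new_size);
                        if (!n)
                                return;
                        out->buf = n;
                        out->size = new_size;
                }

                va_start(args, fmt);
                vsnprintf(out->buf + out->pos, out->size - out->pos, fmt, args);
                va_end(args);
                out->pos += len;
        }

        static void printbuf_exit(struct printbuf *out)
        {
                free(out->buf);
                out->buf = NULL;
        }

        int main(void)
        {
                struct printbuf buf = PRINTBUF;

                prt_printf(&buf, "l %u ", 3U);
                prt_printf(&buf, "min %d - max %d", 0, 100);
                printf("%s\n", buf.buf);
                printbuf_exit(&buf);
                return 0;
        }
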
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index f7e10986f317..4900ed45422e 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -4,8 +4,9 @@
#include "bcachefs.h"
#include "btree_types.h"
+#include "bkey_methods.h"
-extern struct lock_class_key bch2_btree_node_lock_key;
+extern const char * const bch2_btree_node_flags[];
struct btree_iter;
@@ -20,19 +21,19 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
enum six_lock_type, unsigned long);
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
enum btree_id, unsigned, bool);
-int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
+int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
const struct bkey_i *, enum btree_id, unsigned);
-void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
@@ -99,7 +100,7 @@ static inline unsigned btree_blocks(struct bch_fs *c)
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b)
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
- struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
+ const struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 809c9a762303..fb4226aa0255 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -9,6 +9,7 @@
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
@@ -69,23 +70,23 @@ static int bch2_gc_check_topology(struct bch_fs *c,
struct bpos expected_start = bkey_deleted(&prev->k->k)
? node_start
: bpos_successor(prev->k->k.p);
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
- if (bkey_deleted(&prev->k->k)) {
- struct printbuf out = PBUF(buf1);
- pr_buf(&out, "start of node: ");
- bch2_bpos_to_text(&out, node_start);
- } else {
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
- }
-
- if (bpos_cmp(expected_start, bp->v.min_key)) {
+ if (!bpos_eq(expected_start, bp->v.min_key)) {
bch2_topology_error(c);
+ if (bkey_deleted(&prev->k->k)) {
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, node_start);
+ } else {
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
+ }
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
+
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
@@ -94,20 +95,26 @@ static int bch2_gc_check_topology(struct bch_fs *c,
" prev %s\n"
" cur %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1,
- (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) &&
+ buf1.buf, buf2.buf) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
- return FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = -BCH_ERR_need_topology_repair;
+ goto err;
} else {
set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
}
}
}
- if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
+ if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
bch2_topology_error(c);
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
+ bch2_bpos_to_text(&buf2, node_end);
+
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
@@ -116,18 +123,21 @@ static int bch2_gc_check_topology(struct bch_fs *c,
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)) &&
+ buf1.buf, buf2.buf) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
- return FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = -BCH_ERR_need_topology_repair;
+ goto err;
} else {
set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
}
}
bch2_bkey_buf_copy(prev, c, cur.k);
+err:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
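
The checks above swap three-way bpos_cmp() comparisons for named predicates such as bpos_eq() and bpos_gt(), which read less ambiguously than testing a cmp result as a boolean. A small sketch of such helpers over a simplified position type (the real struct bpos has more fields):

        #include <stdbool.h>
        #include <stdio.h>

        struct bpos { unsigned long long inode, offset; };

        static int bpos_cmp(struct bpos l, struct bpos r)
        {
                if (l.inode != r.inode)
                        return l.inode < r.inode ? -1 : 1;
                if (l.offset != r.offset)
                        return l.offset < r.offset ? -1 : 1;
                return 0;
        }

        static bool bpos_eq(struct bpos l, struct bpos r) { return !bpos_cmp(l, r); }
        static bool bpos_lt(struct bpos l, struct bpos r) { return bpos_cmp(l, r) < 0; }
        static bool bpos_gt(struct bpos l, struct bpos r) { return bpos_cmp(l, r) > 0; }
        static bool bpos_ge(struct bpos l, struct bpos r) { return bpos_cmp(l, r) >= 0; }

        int main(void)
        {
                struct bpos a = { 1, 10 }, b = { 1, 20 };

                /* reads as "a is not equal to b", vs. the old "bpos_cmp(a, b)" */
                printf("%d %d %d %d\n", !bpos_eq(a, b), bpos_lt(a, b),
                       bpos_gt(b, a), bpos_ge(b, a));
                return 0;
        }
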
@@ -155,10 +165,11 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
}
}
-static void bch2_btree_node_update_key_early(struct bch_fs *c,
+static void bch2_btree_node_update_key_early(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_i *new)
{
+ struct bch_fs *c = trans->c;
struct btree *b;
struct bkey_buf tmp;
int ret;
@@ -166,7 +177,7 @@ static void bch2_btree_node_update_key_early(struct bch_fs *c,
bch2_bkey_buf_init(&tmp);
bch2_bkey_buf_reassemble(&tmp, c, old);
- b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
if (!IS_ERR_OR_NULL(b)) {
mutex_lock(&c->btree_cache.lock);
@@ -188,9 +199,9 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
struct bkey_i_btree_ptr_v2 *new;
int ret;
- new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_gc_repair_key;
btree_ptr_to_v2(b, new);
b->data->min_key = new_min;
@@ -204,7 +215,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
}
bch2_btree_node_drop_keys_outside_node(b);
-
+ bkey_copy(&b->key, &new->k_i);
return 0;
}
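
set_node_min() and set_node_max() now use kmalloc_array() rather than kmalloc(n * size). The point of the array form is overflow-checked sizing, which the following userspace stand-in illustrates (alloc_array() is not the kernel helper, and the element count is arbitrary):

        #include <stdint.h>
        #include <stdio.h>
        #include <stdlib.h>

        static void *alloc_array(size_t n, size_t size)
        {
                if (size && n > SIZE_MAX / size)
                        return NULL;            /* n * size would overflow */
                return malloc(n * size);
        }

        int main(void)
        {
                uint64_t *key = alloc_array(32, sizeof(uint64_t));      /* ok */
                void *bogus   = alloc_array(SIZE_MAX / 2, 16);          /* refused */

                printf("key: %p bogus: %p\n", (void *) key, bogus);
                free(key);
                return 0;
        }
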
@@ -217,9 +228,9 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
if (ret)
return ret;
- new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_gc_repair_key;
btree_ptr_to_v2(b, new);
b->data->max_key = new_max;
@@ -250,95 +261,107 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
if (!prev) {
- struct printbuf out = PBUF(buf1);
- pr_buf(&out, "start of node: ");
- bch2_bpos_to_text(&out, b->data->min_key);
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, b->data->min_key);
} else {
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
}
- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key));
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
if (prev &&
- bpos_cmp(expected_start, cur->data->min_key) > 0 &&
+ bpos_gt(expected_start, cur->data->min_key) &&
BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
/* cur overwrites prev: */
- if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key,
- cur->data->min_key) >= 0, c,
+ if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
+ cur->data->min_key), c,
"btree node overwritten by next node at btree %s level %u:\n"
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- return DROP_PREV_NODE;
+ buf1.buf, buf2.buf)) {
+ ret = DROP_PREV_NODE;
+ goto out;
+ }
- if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p,
+ if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
bpos_predecessor(cur->data->min_key)), c,
"btree node with incorrect max_key at btree %s level %u:\n"
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
+ buf1.buf, buf2.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
} else {
/* prev overwrites cur: */
- if (mustfix_fsck_err_on(bpos_cmp(expected_start,
- cur->data->max_key) >= 0, c,
+ if (mustfix_fsck_err_on(bpos_ge(expected_start,
+ cur->data->max_key), c,
"btree node overwritten by prev node at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- return DROP_THIS_NODE;
+ buf1.buf, buf2.buf)) {
+ ret = DROP_THIS_NODE;
+ goto out;
+ }
- if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
+ if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1, buf2))
- ret = set_node_min(c, cur, expected_start);
+ buf1.buf, buf2.buf))
+ ret = set_node_min(c, cur, expected_start);
}
+out:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
struct btree *child)
{
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
- if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
+ bch2_bpos_to_text(&buf2, b->key.k.p);
+
+ if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
+ buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p);
if (ret)
- return ret;
+ goto err;
}
+err:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
struct btree *prev = NULL, *cur = NULL;
bool have_child, dropped_children = false;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
if (!b->c.level)
@@ -351,34 +374,38 @@ again:
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
- BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
bch2_btree_and_journal_iter_advance(&iter);
bch2_bkey_buf_reassemble(&cur_k, c, k);
- cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+
if (mustfix_fsck_err_on(ret == -EIO, c,
- "Unreadable btree node at btree %s level %u:\n"
+ "Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) {
- bch2_btree_node_evict(c, cur_k.k);
+ buf.buf)) {
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
+ cur = NULL;
if (ret)
break;
continue;
}
if (ret) {
- bch_err(c, "%s: error %i getting btree node",
- __func__, ret);
+ bch_err(c, "%s: error getting btree node: %s",
+ __func__, bch2_err_str(ret));
break;
}
@@ -386,9 +413,10 @@ again:
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(c, cur_k.k);
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
+ cur = NULL;
if (ret)
break;
continue;
@@ -399,7 +427,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
- bch2_btree_node_evict(c, prev_k.k);
+ bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
if (ret)
@@ -439,23 +467,23 @@ again:
bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
if (ret) {
- bch_err(c, "%s: error %i getting btree node",
- __func__, ret);
+ bch_err(c, "%s: error getting btree node: %s",
+ __func__, bch2_err_str(ret));
goto err;
}
- ret = bch2_btree_repair_topology_recurse(c, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur);
six_unlock_read(&cur->c.lock);
cur = NULL;
if (ret == DROP_THIS_NODE) {
- bch2_btree_node_evict(c, cur_k.k);
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
dropped_children = true;
@@ -467,12 +495,14 @@ again:
have_child = true;
}
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
if (mustfix_fsck_err_on(!have_child, c,
"empty interior btree node at btree %s level %u\n"
" %s",
bch2_btree_ids[b->c.btree_id],
- b->c.level,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
+ b->c.level, buf.buf))
ret = DROP_THIS_NODE;
err:
fsck_err:
@@ -488,42 +518,49 @@ fsck_err:
if (!ret && dropped_children)
goto again;
+ printbuf_exit(&buf);
return ret;
}
static int bch2_repair_topology(struct bch_fs *c)
{
+ struct btree_trans trans;
struct btree *b;
unsigned i;
int ret = 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
for (i = 0; i < BTREE_ID_NR && !ret; i++) {
b = c->btree_roots[i].b;
if (btree_node_fake(b))
continue;
- six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_repair_topology_recurse(c, b);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(&trans, b);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
bch_err(c, "empty btree root - repair unimplemented");
- ret = FSCK_ERR_EXIT;
+ ret = -BCH_ERR_fsck_repair_unimplemented;
}
}
+ bch2_trans_exit(&trans);
+
return ret;
}
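
bch2_repair_topology() now brackets its work with bch2_trans_init()/bch2_trans_exit() and passes the transaction down instead of a bare struct bch_fs *, letting callees recover the filesystem via trans->c and record per-transaction state such as the memory_allocation_failure flag seen earlier. A deliberately simplified model of that threading (all types and helpers here are stand-ins):

        #include <stdbool.h>
        #include <stdio.h>

        struct bch_fs { const char *name; };

        struct btree_trans {
                struct bch_fs   *c;
                bool            memory_allocation_failure;
        };

        static void trans_init(struct btree_trans *trans, struct bch_fs *c)
        {
                trans->c = c;
                trans->memory_allocation_failure = false;
        }

        static void trans_exit(struct btree_trans *trans)
        {
                trans->c = NULL;
        }

        static void repair_one_node(struct btree_trans *trans)
        {
                struct bch_fs *c = trans->c;    /* same pattern as the diff's c = trans->c */

                printf("repairing a node on %s\n", c->name);
        }

        int main(void)
        {
                struct bch_fs c = { "bcachefs-model" };
                struct btree_trans trans;

                trans_init(&trans, &c);
                repair_one_node(&trans);
                trans_exit(&trans);
                return 0;
        }
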
-static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p = { 0 };
bool do_update = false;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
/*
@@ -533,78 +570,82 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
- if (fsck_err_on(!g->gen_valid, c,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ if (!g->gen_valid &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
} else {
do_update = true;
}
}
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen, g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- g2->_mark.data_type = 0;
- g2->_mark.dirty_sectors = 0;
- g2->_mark.cached_sectors = 0;
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
} else {
do_update = true;
}
}
- if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
do_update = true;
- if (fsck_err_on(!p.ptr.cached &&
- gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
- p.ptr.gen, g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
do_update = true;
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen)
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
continue;
- if (fsck_err_on(g->mark.data_type &&
- g->mark.data_type != data_type, c,
+ if (fsck_err_on(bucket_data_type(g->data_type) &&
+ bucket_data_type(g->data_type) != data_type, c,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->mark.data_type],
+ bch2_data_types[g->data_type],
bch2_data_types[data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->data_type = data_type;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
} else {
do_update = true;
@@ -618,14 +659,16 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
"pointer to nonexistent stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
- if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c,
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
"pointer does not match stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
}
}
@@ -638,13 +681,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
if (is_root) {
bch_err(c, "cannot update btree roots yet");
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
bch_err(c, "%s: error allocating new key", __func__);
- return -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_gc_repair_key;
+ goto err;
}
bkey_reassemble(new, *k);
@@ -660,7 +705,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_GC_BUCKET(ca, ptr);
- ptr->gen = g->mark.gen;
+ ptr->gen = g->gen;
}
} else {
bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
@@ -669,12 +714,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
(ptr->cached &&
- (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
(!ptr->cached &&
- gen_cmp(ptr->gen, g->mark.gen) < 0) ||
- gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
- (g->mark.data_type &&
- g->mark.data_type != data_type);
+ gen_cmp(ptr->gen, g->gen) < 0) ||
+ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type);
}));
again:
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
@@ -708,19 +753,27 @@ found:
ret = bch2_journal_key_insert_take(c, btree_id, level, new);
if (ret) {
kfree(new);
- return ret;
+ goto err;
}
if (level)
- bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
+ bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+
+ if (0) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, *k);
+ bch_info(c, "updated %s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
- bch2_bkey_val_to_text(&PBUF(buf), c, *k);
- bch_info(c, "updated %s", buf);
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new));
- bch_info(c, "new key %s", buf);
*k = bkey_i_to_s_c(new);
}
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
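
bch2_check_fix_ptrs() above rewrites fsck_err_on(cond, ...) as cond && (c->opts.reconstruct_alloc || fsck_err(...)), so that with reconstruct_alloc set the repair is applied without emitting an fsck error. A tiny model of that short-circuit decision (opts, fsck_err() and maybe_fix_gen() are invented for the sketch):

        #include <stdbool.h>
        #include <stdio.h>

        struct opts { bool reconstruct_alloc; };

        static bool fsck_err(const char *msg)
        {
                printf("fsck: %s\n", msg);
                return true;            /* models fsck_err() answering "go ahead and fix" */
        }

        static unsigned maybe_fix_gen(struct opts *opts, unsigned gen, unsigned want)
        {
                if (gen != want &&
                    (opts->reconstruct_alloc ||
                     fsck_err("bucket gen missing in alloc btree")))
                        gen = want;     /* silent when reconstruct_alloc is set */
                return gen;
        }

        int main(void)
        {
                struct opts quiet = { .reconstruct_alloc = true };
                struct opts loud  = { .reconstruct_alloc = false };

                printf("%u\n", maybe_fix_gen(&quiet, 0, 7));    /* fixed, no message */
                printf("%u\n", maybe_fix_gen(&loud, 0, 7));     /* fixed, with message */
                return 0;
        }
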
@@ -729,11 +782,9 @@ fsck_err:
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k,
- u8 *max_stale, bool initial)
+ bool initial)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
unsigned flags =
@@ -745,9 +796,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k->k->version.lo > journal_cur_seq(&c->journal));
+ k->k->version.lo > atomic64_read(&c->journal.seq));
- ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
if (ret)
goto err;
@@ -758,27 +809,16 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
atomic64_set(&c->key_version, k->k->version.lo);
}
- ptrs = bch2_bkey_ptrs_c(*k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- if (gen_after(g->oldest_gen, ptr->gen))
- g->oldest_gen = ptr->gen;
-
- *max_stale = max(*max_stale, ptr_stale(ca, ptr));
- }
-
- ret = bch2_mark_key(trans, old, *k, flags);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_mark_key(trans, btree_id, level, old, *k, flags));
fsck_err:
err:
if (ret)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
return ret;
}
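
bch2_gc_mark_key() now wraps the mark in commit_do(), and many paths return BCH_ERR_transaction_restart_* codes; the convention appears to be that restart-class errors mean "retry the whole operation" rather than "fail". A loose, standalone model of such a retry loop (the error value and retry_do() are invented; this is not the commit_do() macro):

        #include <stdio.h>

        #define ERR_TRANSACTION_RESTART (-1000) /* stand-in for a restart-class error */

        static int attempts;

        static int mark_key(void)
        {
                /* pretend the first attempt races with another transaction */
                return ++attempts < 2 ? ERR_TRANSACTION_RESTART : 0;
        }

        static int retry_do(int (*op)(void))
        {
                int ret;

                do {
                        ret = op();
                } while (ret == ERR_TRANSACTION_RESTART);

                return ret;
        }

        int main(void)
        {
                int ret = retry_do(mark_key);

                printf("ret %d after %d attempts\n", ret, attempts);
                return 0;
        }
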
-static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale,
- bool initial)
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
struct bch_fs *c = trans->c;
struct btree_node_iter iter;
@@ -787,8 +827,6 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma
struct bkey_buf prev, cur;
int ret = 0;
- *max_stale = 0;
-
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
@@ -799,7 +837,7 @@ static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *ma
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
- &k, max_stale, initial);
+ &k, initial);
if (ret)
break;
@@ -826,11 +864,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct btree *b;
- unsigned depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
- : 0;
- u8 max_stale = 0;
+ unsigned depth = metadata_only ? 1 : 0;
int ret = 0;
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
@@ -841,21 +875,9 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree_node(b));
- ret = btree_gc_mark_node(trans, b, &max_stale, initial);
+ ret = btree_gc_mark_node(trans, b, initial);
if (ret)
break;
-
- if (!initial) {
- if (max_stale > 64)
- bch2_btree_node_rewrite(trans, &iter, b,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
- else if (!bch2_btree_gc_rewrite_disabled &&
- (bch2_btree_gc_always_rewrite || max_stale > 16))
- bch2_btree_node_rewrite(trans, &iter,
- b, BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
- }
}
bch2_trans_iter_exit(trans, &iter);
@@ -867,8 +889,8 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
if (!btree_node_fake(b)) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
- &k, &max_stale, initial);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
+ true, &k, initial);
}
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
@@ -883,8 +905,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- u8 max_stale = 0;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
@@ -893,13 +914,14 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
- BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
- &k, &max_stale, true);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+ false, &k, true);
if (ret) {
- bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
+ bch_err(c, "%s: error from bch2_gc_mark_key: %s",
+ __func__, bch2_err_str(ret));
goto fsck_err;
}
@@ -929,7 +951,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
bch2_bkey_buf_reassemble(&cur, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- child = bch2_btree_node_get_noiter(c, cur.k,
+ child = bch2_btree_node_get_noiter(trans, cur.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(child);
@@ -945,9 +967,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
" %s",
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
- (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) &&
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) {
- ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
+ ret = -BCH_ERR_need_topology_repair;
bch_info(c, "Halting mark and sweep to start topology repair pass");
goto fsck_err;
} else {
@@ -958,8 +981,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
continue;
}
} else if (ret) {
- bch_err(c, "%s: error %i getting btree node",
- __func__, ret);
+ bch_err(c, "%s: error getting btree node: %s",
+ __func__, bch2_err_str(ret));
break;
}
@@ -975,6 +998,7 @@ fsck_err:
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
+ printbuf_exit(&buf);
return ret;
}
@@ -984,12 +1008,8 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree *b;
- unsigned target_depth = metadata_only ? 1
- : bch2_expensive_debug_checks ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
- : 0;
- u8 max_stale = 0;
- char buf[100];
+ unsigned target_depth = metadata_only ? 1 : 0;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
b = c->btree_roots[btree_id].b;
@@ -998,19 +1018,21 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
- if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
- "btree root with incorrect min_key: %s",
- (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
+ "btree root with incorrect min_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
- ret = FSCK_ERR_EXIT;
+ ret = -BCH_ERR_fsck_repair_unimplemented;
goto fsck_err;
}
- if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c,
- "btree root with incorrect max_key: %s",
- (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
+ if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
+ "btree root with incorrect max_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
- ret = FSCK_ERR_EXIT;
+ ret = -BCH_ERR_fsck_repair_unimplemented;
goto fsck_err;
}
@@ -1020,14 +1042,15 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
if (!ret) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
- &k, &max_stale, true);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
+ &k, true);
}
fsck_err:
six_unlock_read(&b->c.lock);
if (ret < 0)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
+ printbuf_exit(&buf);
return ret;
}
@@ -1046,6 +1069,9 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
bch2_trans_init(&trans, c, 0, 0);
+ if (initial)
+ trans.is_initial_gc = true;
+
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
@@ -1056,7 +1082,7 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
: bch2_gc_btree(&trans, ids[i], initial, metadata_only);
if (ret < 0)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
@@ -1147,10 +1173,10 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->gc_stripes);
for_each_member_device(ca, c, i) {
- kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- ca->buckets[1] = NULL;
+ ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
@@ -1164,40 +1190,29 @@ static int bch2_gc_done(struct bch_fs *c,
bool initial, bool metadata_only)
{
struct bch_dev *ca = NULL;
- bool verify = !metadata_only && (!initial ||
- (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+ struct printbuf buf = PRINTBUF;
+ bool verify = !metadata_only &&
+ !c->opts.reconstruct_alloc &&
+ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i, dev;
int ret = 0;
+ percpu_down_write(&c->mark_lock);
+
#define copy_field(_f, _msg, ...) \
- if (dst->_f != src->_f) { \
- if (verify) \
- fsck_err(c, _msg ": got %llu, should be %llu" \
- , ##__VA_ARGS__, dst->_f, src->_f); \
- dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
- }
+ if (dst->_f != src->_f && \
+ (!verify || \
+ fsck_err(c, _msg ": got %llu, should be %llu" \
+ , ##__VA_ARGS__, dst->_f, src->_f))) \
+ dst->_f = src->_f
#define copy_stripe_field(_f, _msg, ...) \
- if (dst->_f != src->_f) { \
- if (verify) \
- fsck_err(c, "stripe %zu has wrong "_msg \
- ": got %u, should be %u", \
- iter.pos, ##__VA_ARGS__, \
- dst->_f, src->_f); \
- dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
- }
-#define copy_bucket_field(_f) \
- if (dst->b[b]._f != src->b[b]._f) { \
- if (verify) \
- fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", dev, b, \
- dst->b[b].mark.gen, \
- bch2_data_types[dst->b[b].mark.data_type],\
- dst->b[b]._f, src->b[b]._f); \
- dst->b[b]._f = src->b[b]._f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
- }
+ if (dst->_f != src->_f && \
+ (!verify || \
+ fsck_err(c, "stripe %zu has wrong "_msg \
+ ": got %u, should be %u", \
+ iter.pos, ##__VA_ARGS__, \
+ dst->_f, src->_f))) \
+ dst->_f = src->_f
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
@@ -1207,36 +1222,17 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
- struct bucket_array *dst = __bucket_array(ca, 0);
- struct bucket_array *src = __bucket_array(ca, 1);
- size_t b;
-
- for (b = 0; b < src->nbuckets; b++) {
- copy_bucket_field(_mark.gen);
- copy_bucket_field(_mark.data_type);
- copy_bucket_field(_mark.stripe);
- copy_bucket_field(_mark.dirty_sectors);
- copy_bucket_field(_mark.cached_sectors);
- copy_bucket_field(stripe_redundancy);
- copy_bucket_field(stripe);
-
- dst->b[b].oldest_gen = src->b[b].oldest_gen;
- }
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
- {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage_gc,
- dev_usage_u64s());
+ copy_dev_field(buckets_ec, "buckets_ec");
- copy_dev_field(buckets_ec, "buckets_ec");
- copy_dev_field(buckets_unavailable, "buckets_unavailable");
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
- }
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
};
@@ -1263,34 +1259,35 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- char buf[80];
if (metadata_only &&
(e->data_type == BCH_DATA_user ||
e->data_type == BCH_DATA_cached))
continue;
- bch2_replicas_entry_to_text(&PBUF(buf), e);
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, e);
- copy_fs_field(replicas[i], "%s", buf);
+ copy_fs_field(replicas[i], "%s", buf.buf);
}
}
#undef copy_fs_field
#undef copy_dev_field
-#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
if (ca)
percpu_ref_put(&ca->ref);
if (ret)
- bch_err(c, "%s: ret %i", __func__, ret);
+ bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret));
+
+ percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_gc_start(struct bch_fs *c,
- bool metadata_only)
+static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca = NULL;
unsigned i;
@@ -1301,151 +1298,345 @@ static int bch2_gc_start(struct bch_fs *c,
sizeof(u64), GFP_KERNEL);
if (!c->usage_gc) {
bch_err(c, "error allocating c->usage_gc");
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_gc_start;
}
for_each_member_device(ca, c, i) {
- BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage_gc);
- ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets[1]) {
- percpu_ref_put(&ca->ref);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -ENOMEM;
- }
-
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_gc_start;
}
+
+ this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
+ ca->mi.nbuckets - ca->mi.first_bucket);
}
- percpu_down_write(&c->mark_lock);
+ return 0;
+}
+
+static int bch2_gc_reset(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
for_each_member_device(ca, c, i) {
- struct bucket_array *dst = __bucket_array(ca, 1);
- struct bucket_array *src = __bucket_array(ca, 0);
- size_t b;
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
+ }
- dst->first_bucket = src->first_bucket;
- dst->nbuckets = src->nbuckets;
+ free_percpu(c->usage_gc);
+ c->usage_gc = NULL;
- for (b = 0; b < src->nbuckets; b++) {
- struct bucket *d = &dst->b[b];
- struct bucket *s = &src->b[b];
+ return bch2_gc_start(c);
+}
- d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
- d->gen_valid = s->gen_valid;
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+ struct bch_alloc_v4 r)
+{
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type ||
+ l.dirty_sectors != r.dirty_sectors ||
+ l.cached_sectors != r.cached_sectors ||
+ l.stripe_redundancy != r.stripe_redundancy ||
+ l.stripe != r.stripe;
+}
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached))
- d->_mark = s->mark;
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ struct bucket gc, *b;
+ struct bkey_i_alloc_v4 *a;
+ struct bch_alloc_v4 old_convert, new;
+ const struct bch_alloc_v4 *old;
+ enum bch_data_type type;
+ int ret;
+
+ if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
+ return 1;
+
+ old = bch2_alloc_to_v4(k, &old_convert);
+ new = *old;
+
+ percpu_down_read(&c->mark_lock);
+ b = gc_bucket(ca, iter->pos.offset);
+
+ /*
+ * b->data_type doesn't yet include need_discard & need_gc_gen states -
+ * fix that here:
+ */
+ type = __alloc_data_type(b->dirty_sectors,
+ b->cached_sectors,
+ b->stripe,
+ *old,
+ b->data_type);
+ if (b->data_type != type) {
+ struct bch_dev_usage *u;
+
+ preempt_disable();
+ u = this_cpu_ptr(ca->usage_gc);
+ u->d[b->data_type].buckets--;
+ b->data_type = type;
+ u->d[b->data_type].buckets++;
+ preempt_enable();
+ }
+
+ gc = *b;
+ percpu_up_read(&c->mark_lock);
+
+ if (metadata_only &&
+ gc.data_type != BCH_DATA_sb &&
+ gc.data_type != BCH_DATA_journal &&
+ gc.data_type != BCH_DATA_btree)
+ return 0;
+
+ if (gen_after(old->gen, gc.gen))
+ return 0;
+
+ if (c->opts.reconstruct_alloc ||
+ fsck_err_on(new.data_type != gc.data_type, c,
+ "bucket %llu:%llu gen %u has wrong data_type"
+ ": got %s, should be %s",
+ iter->pos.inode, iter->pos.offset,
+ gc.gen,
+ bch2_data_types[new.data_type],
+ bch2_data_types[gc.data_type]))
+ new.data_type = gc.data_type;
+
+#define copy_bucket_field(_f) \
+ if (c->opts.reconstruct_alloc || \
+ fsck_err_on(new._f != gc._f, c, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %u, should be %u", \
+ iter->pos.inode, iter->pos.offset, \
+ gc.gen, \
+ bch2_data_types[gc.data_type], \
+ new._f, gc._f)) \
+ new._f = gc._f; \
+
+ copy_bucket_field(gen);
+ copy_bucket_field(dirty_sectors);
+ copy_bucket_field(cached_sectors);
+ copy_bucket_field(stripe_redundancy);
+ copy_bucket_field(stripe);
+#undef copy_bucket_field
+
+ if (!bch2_alloc_v4_cmp(*old, new))
+ return 0;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ a->v = new;
+
+ /*
+ * The trigger normally makes sure this is set, but we're not running
+ * triggers:
+ */
+ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+fsck_err:
+ return ret;
+}
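
The copy_bucket_field() macro above is the core of the reconciliation step: for each field it compares the value read back from the alloc btree against the value GC recomputed in memory, raises an fsck message on a mismatch, and takes GC's value. A minimal standalone sketch of that shape, assuming nothing from bcachefs (bucket_state, copy_field and the example values are purely illustrative):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct bucket_state {
	uint8_t  gen;
	uint32_t dirty_sectors;
	uint32_t cached_sectors;
};

/* compare one field of the on-disk copy against what GC recomputed; on a
 * mismatch report it and take the GC value, like copy_bucket_field() does */
#define copy_field(_dst, _src, _f)					\
do {									\
	if ((_dst)->_f != (_src)->_f) {					\
		printf("bucket has wrong " #_f ": got %u, should be %u\n",\
		       (unsigned) (_dst)->_f, (unsigned) (_src)->_f);	\
		(_dst)->_f = (_src)->_f;				\
		fixed = true;						\
	}								\
} while (0)

int main(void)
{
	struct bucket_state disk = { .gen = 3, .dirty_sectors = 100, .cached_sectors = 8 };
	struct bucket_state gc   = { .gen = 3, .dirty_sectors =  96, .cached_sectors = 0 };
	bool fixed = false;

	copy_field(&disk, &gc, gen);
	copy_field(&disk, &gc, dirty_sectors);
	copy_field(&disk, &gc, cached_sectors);

	printf("update needed: %s\n", fixed ? "yes" : "no");
	return 0;
}

The real macro additionally honours c->opts.reconstruct_alloc and routes the message through fsck_err_on(), and the caller only issues a btree update when bch2_alloc_v4_cmp() says something actually changed.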
+
+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_member_device(ca, c, i) {
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(&trans, &iter, k, metadata_only));
+
+ if (ret < 0) {
+ bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
+ percpu_ref_put(&ca->ref);
+ break;
+ }
+ }
+
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
+}
+
+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+{
+ struct bch_dev *ca;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bucket *g;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ unsigned i;
+ int ret;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!buckets) {
+ percpu_ref_put(&ca->ref);
+ bch_err(c, "error allocating ca->buckets[gc]");
+ return -BCH_ERR_ENOMEM_gc_alloc_start;
}
+
+ buckets->first_bucket = ca->mi.first_bucket;
+ buckets->nbuckets = ca->mi.nbuckets;
+ rcu_assign_pointer(ca->buckets_gc, buckets);
};
- percpu_up_write(&c->mark_lock);
+ bch2_trans_init(&trans, c, 0, 0);
- return 0;
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = gc_bucket(ca, k.k->p.offset);
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ g->gen_valid = 1;
+ g->gen = a->gen;
+
+ if (metadata_only &&
+ (a->data_type == BCH_DATA_user ||
+ a->data_type == BCH_DATA_cached ||
+ a->data_type == BCH_DATA_parity)) {
+ g->data_type = a->data_type;
+ g->dirty_sectors = a->dirty_sectors;
+ g->cached_sectors = a->cached_sectors;
+ g->stripe = a->stripe;
+ g->stripe_redundancy = a->stripe_redundancy;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+
+ if (ret)
+ bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
+
+ return ret;
}
-static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
struct bch_dev *ca;
unsigned i;
for_each_member_device(ca, c, i) {
- struct bucket_array *buckets = __bucket_array(ca, true);
+ struct bucket_array *buckets = gc_bucket_array(ca);
struct bucket *g;
for_each_bucket(g, buckets) {
if (metadata_only &&
- (g->mark.data_type == BCH_DATA_user ||
- g->mark.data_type == BCH_DATA_cached ||
- g->mark.data_type == BCH_DATA_parity))
+ (g->data_type == BCH_DATA_user ||
+ g->data_type == BCH_DATA_cached ||
+ g->data_type == BCH_DATA_parity))
continue;
- g->_mark.dirty_sectors = 0;
- g->_mark.cached_sectors = 0;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
}
};
}
-static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
- bool metadata_only)
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ size_t *idx)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct bch_fs *c = trans->c;
+ const __le64 *refcount = bkey_refcount_c(k);
+ struct printbuf buf = PRINTBUF;
struct reflink_gc *r;
- size_t idx = 0;
- char buf[200];
int ret = 0;
- if (metadata_only)
+ if (!refcount)
return 0;
- bch2_trans_init(&trans, c, 0, 0);
+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+ r->offset < k.k->p.offset)
+ ++*idx;
- for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
- if (!refcount)
- continue;
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ r->refcount)) {
+ struct bkey_i *new = bch2_bkey_make_mut(trans, k);
- r = genradix_ptr(&c->reflink_gc_table, idx++);
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- ret = -EINVAL;
- break;
- }
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- r->refcount)) {
- struct bkey_i *new;
-
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- break;
- }
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(new) = cpu_to_le64(r->refcount);
+
+ ret = bch2_trans_update(trans, iter, new, 0);
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
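
bch2_gc_write_reflink_key() walks the reflink btree and the in-memory reflink_gc_table in lockstep: both are ordered by offset, so a single index carried across calls (*idx) is advanced until the table entry catches up with the current key, and then the stored and recomputed refcounts are compared. A small self-contained sketch of that merge-style walk, assuming both sequences are sorted by offset (gc_entry, key and check_key are illustrative names, not bcachefs ones):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct gc_entry { uint64_t offset; uint64_t refcount; };	/* what GC counted */
struct key      { uint64_t offset; uint64_t refcount; };	/* what is on disk */

/* advance *idx until the gc table entry is no longer behind the key,
 * then compare - the same shape as the while loop above */
static void check_key(const struct gc_entry *tbl, size_t n, size_t *idx,
		      const struct key *k)
{
	while (*idx < n && tbl[*idx].offset < k->offset)
		++*idx;

	if (*idx >= n || tbl[*idx].offset != k->offset) {
		printf("offset %llu: missing from gc table\n",
		       (unsigned long long) k->offset);
		return;
	}

	if (tbl[*idx].refcount != k->refcount)
		printf("offset %llu: refcount %llu should be %llu\n",
		       (unsigned long long) k->offset,
		       (unsigned long long) k->refcount,
		       (unsigned long long) tbl[*idx].refcount);
}

int main(void)
{
	struct gc_entry tbl[]  = { { 8, 2 }, { 24, 1 }, { 40, 3 } };
	struct key      keys[] = { { 8, 2 }, { 24, 5 }, { 40, 3 } };
	size_t idx = 0;

	for (size_t i = 0; i < 3; i++)
		check_key(tbl, 3, &idx, &keys[i]);
	return 0;
}

In the real function a mismatch is repaired by mutating the key - or deleting it outright when the recomputed refcount is zero - and the table lives in a genradix rather than a flat array.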
- bkey_reassemble(new, k);
+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t idx = 0;
+ int ret = 0;
- if (!r->refcount)
- new->k.type = KEY_TYPE_deleted;
- else
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ if (metadata_only)
+ return 0;
- ret = initial
- ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
- : __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
- kfree(new);
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
- if (ret)
- break;
- }
- }
-fsck_err:
- bch2_trans_iter_exit(&trans, &iter);
c->reflink_gc_nr = 0;
bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+static int bch2_gc_reflink_start(struct bch_fs *c,
bool metadata_only)
{
struct btree_trans trans;
@@ -1470,7 +1661,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
GFP_KERNEL);
if (!r) {
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
break;
}
@@ -1484,8 +1675,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
return ret;
}
-static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
- bool metadata_only)
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
{
struct genradix_iter iter;
struct reflink_gc *r;
@@ -1494,71 +1684,81 @@ static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
r->refcount = 0;
}
-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
- bool metadata_only)
+static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct gc_stripe *m;
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
const struct bch_stripe *s;
- char buf[200];
+ struct gc_stripe *m;
+ bool bad = false;
unsigned i;
int ret = 0;
- if (metadata_only)
+ if (k.k->type != KEY_TYPE_stripe)
return 0;
- bch2_trans_init(&trans, c, 0, 0);
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
- for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_stripe)
- continue;
+ for (i = 0; i < s->nr_blocks; i++) {
+ u32 old = stripe_blockcount_get(s, i);
+ u32 new = (m ? m->block_sectors[i] : 0);
- s = bkey_s_c_to_stripe(k).v;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
- for (i = 0; i < s->nr_blocks; i++)
- if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
- goto inconsistent;
- continue;
-inconsistent:
- if (fsck_err_on(true, c,
- "stripe has wrong block sector count %u:\n"
- " %s\n"
- " should be %u", i,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- m ? m->block_sectors[i] : 0)) {
- struct bkey_i_stripe *new;
-
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- break;
- }
+ if (old != new) {
+ prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
+ i, old, new);
+ bad = true;
+ }
+ }
- bkey_reassemble(&new->k_i, k);
+ if (bad)
+ bch2_bkey_val_to_text(&buf, c, k);
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+ if (fsck_err_on(bad, c, "%s", buf.buf)) {
+ struct bkey_i_stripe *new;
- ret = initial
- ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
- : __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i));
- kfree(new);
- }
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&new->k_i, k);
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+ ret = bch2_trans_update(trans, iter, &new->k_i, 0);
}
fsck_err:
- bch2_trans_iter_exit(&trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
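
Rather than bailing out at the first bad block, bch2_gc_write_stripes_key() now accumulates every mismatching per-block sector count into one printbuf, raises a single fsck error, and rewrites the stripe key with the recomputed counts. A rough userspace equivalent of that accumulate-then-repair flow, with snprintf standing in for the printbuf API (sizes and values are made up, and truncation is not handled in this sketch):

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>

int main(void)
{
	uint32_t ondisk[4]     = { 128, 64, 64, 32 };	/* counts stored in the stripe key */
	uint32_t recomputed[4] = { 128, 60, 64, 32 };	/* what GC counted from extents    */
	char msg[256];
	int used = 0;
	bool bad = false;

	/* collect every mismatch into a single report before deciding to repair */
	for (unsigned i = 0; i < 4; i++)
		if (ondisk[i] != recomputed[i]) {
			used += snprintf(msg + used, sizeof(msg) - used,
					 "stripe block %u has wrong sector count: got %u, should be %u\n",
					 i, ondisk[i], recomputed[i]);
			bad = true;
		}

	if (bad) {
		fputs(msg, stdout);
		memcpy(ondisk, recomputed, sizeof(ondisk));	/* take GC's counts */
	}
	return 0;
}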
+
+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_stripes_key(&trans, &iter, k));
bch2_trans_exit(&trans);
return ret;
}
-static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
- bool metadata_only)
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
{
genradix_free(&c->gc_stripes);
}
@@ -1583,22 +1783,18 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
*/
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
- struct bch_dev *ca;
- u64 start_time = local_clock();
- unsigned i, iter = 0;
+ unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
- trace_gc_start(c);
down_write(&c->gc_lock);
- /* flush interior btree updates: */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ bch2_btree_interior_updates_flush(c);
- ret = bch2_gc_start(c, metadata_only) ?:
- bch2_gc_reflink_start(c, initial, metadata_only);
+ ret = bch2_gc_start(c) ?:
+ bch2_gc_alloc_start(c, metadata_only) ?:
+ bch2_gc_reflink_start(c, metadata_only);
if (ret)
goto out;
again:
@@ -1609,26 +1805,27 @@ again:
if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
c->opts.fix_errors != FSCK_OPT_NO) {
- bch_info(c, "starting topology repair pass");
+ bch_info(c, "Starting topology repair pass");
ret = bch2_repair_topology(c);
if (ret)
goto out;
- bch_info(c, "topology repair pass done");
+ bch_info(c, "Topology repair pass done");
set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags);
}
ret = bch2_gc_btrees(c, initial, metadata_only);
- if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
+ if (ret == -BCH_ERR_need_topology_repair &&
!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) &&
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true);
ret = 0;
}
- if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR)
- ret = FSCK_ERR_EXIT;
+ if (ret == -BCH_ERR_need_topology_repair)
+ ret = -BCH_ERR_fsck_errors_not_fixed;
if (ret)
goto out;
@@ -1653,9 +1850,12 @@ again:
clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
- bch2_gc_stripes_reset(c, initial, metadata_only);
- bch2_gc_alloc_reset(c, initial, metadata_only);
- bch2_gc_reflink_reset(c, initial, metadata_only);
+ bch2_gc_stripes_reset(c, metadata_only);
+ bch2_gc_alloc_reset(c, metadata_only);
+ bch2_gc_reflink_reset(c, metadata_only);
+ ret = bch2_gc_reset(c);
+ if (ret)
+ goto out;
/* flush fsck errors, reset counters */
bch2_flush_fsck_errs(c);
@@ -1665,16 +1865,15 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
- percpu_down_write(&c->mark_lock);
- ret = bch2_gc_reflink_done(c, initial, metadata_only) ?:
- bch2_gc_stripes_done(c, initial, metadata_only) ?:
+ ret = bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only) ?:
+ bch2_gc_alloc_done(c, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
- } else {
- percpu_down_write(&c->mark_lock);
}
+ percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
@@ -1683,16 +1882,6 @@ out:
up_write(&c->gc_lock);
- trace_gc_end(c);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
-
- /*
- * Wake up allocator in case it was waiting for buckets
- * because of not being able to inc gens
- */
- for_each_member_device(ca, c, i)
- bch2_wake_allocator(ca);
-
/*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
@@ -1701,96 +1890,75 @@ out:
return ret;
}
-static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
+static int gc_btree_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
+ struct bkey_i *u;
+ int ret;
percpu_down_read(&c->mark_lock);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr);
- if (gen_after(g->mark.gen, ptr->gen) > 16) {
+ if (ptr_stale(ca, ptr) > 16) {
percpu_up_read(&c->mark_lock);
- return true;
+ goto update;
}
}
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr);
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(g->gc_gen, ptr->gen))
- g->gc_gen = ptr->gen;
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
}
percpu_up_read(&c->mark_lock);
+ return 0;
+update:
+ u = bch2_bkey_make_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
- return false;
+ bch2_extent_normalize(c, bkey_i_to_s(u));
+ return bch2_trans_update(trans, iter, u, 0);
}
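
Bucket generation numbers are 8 bits wide and wrap, so "how stale is this pointer" has to be answered with modulo-256 arithmetic: ptr_stale() above compares the bucket's current gen against the pointer's gen that way, and the > 16 test rewrites keys whose cached pointers have fallen far behind before the counter could ever catch up with them. A tiny sketch of the wraparound-safe difference (gen_delta is an illustrative helper, not the bcachefs one):

#include <stdio.h>
#include <stdint.h>

/* how many generations the pointer lags the bucket, taken modulo 256,
 * so gen 2 is 4 generations after gen 254 */
static unsigned gen_delta(uint8_t bucket_gen, uint8_t ptr_gen)
{
	return (uint8_t) (bucket_gen - ptr_gen);
}

int main(void)
{
	printf("%u\n", gen_delta(10, 7));	/* 3:  slightly stale        */
	printf("%u\n", gen_delta(2, 254));	/* 4:  wrapped, still fine   */
	printf("%u\n", gen_delta(200, 150));	/* 50: > 16, rewrite the key */
	return 0;
}

The same wraparound rule is behind gen_after() in the second loop, which lowers ca->oldest_gen[] toward the oldest pointer generation still referencing each bucket.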
-/*
- * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
- * node pointers currently never have cached pointers that can become stale:
- */
-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_buf sk;
- int ret = 0, commit_err = 0;
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
-
- bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((bch2_trans_begin(&trans),
- k = bch2_btree_iter_peek(&iter)).k) {
- ret = bkey_err(k);
-
- if (ret == -EINTR)
- continue;
- if (ret)
- break;
-
- c->gc_gens_pos = iter.pos;
-
- if (gc_btree_gens_key(c, k) && !commit_err) {
- bch2_bkey_buf_reassemble(&sk, c, k);
- bch2_extent_normalize(c, bkey_i_to_s(sk.k));
+ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ struct bkey_i_alloc_v4 *a_mut;
+ int ret;
- commit_err =
- bch2_trans_update(&trans, &iter, sk.k, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_NOFAIL);
- if (commit_err == -EINTR) {
- commit_err = 0;
- continue;
- }
- }
+ if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
+ return 0;
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(&trans, &iter);
+ a_mut = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ return ret;
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
+ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+ a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
- return ret;
+ return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}
int bch2_gc_gens(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
- u64 start_time = local_clock();
+ u64 b, start_time = local_clock();
unsigned i;
int ret;
@@ -1799,36 +1967,63 @@ int bch2_gc_gens(struct bch_fs *c)
* introduces a deadlock in the RO path - we currently take the state
* lock at the start of going RO, thus the gc thread may get stuck:
*/
+ if (!mutex_trylock(&c->gc_gens_lock))
+ return 0;
+
+ trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
+ bch2_trans_init(&trans, c, 0, 0);
for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
+ struct bucket_gens *gens;
+
+ BUG_ON(ca->oldest_gen);
+
+ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ percpu_ref_put(&ca->ref);
+ ret = -BCH_ERR_ENOMEM_gc_gens;
+ goto err;
+ }
+
+ gens = bucket_gens(ca);
- for_each_bucket(g, buckets)
- g->gc_gen = g->mark.gen;
- up_read(&ca->bucket_lock);
+ for (b = gens->first_bucket;
+ b < gens->nbuckets; b++)
+ ca->oldest_gen[b] = gens->b[b];
}
for (i = 0; i < BTREE_ID_NR; i++)
- if ((1 << i) & BTREE_ID_HAS_PTRS) {
+ if (btree_type_has_ptrs(i)) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = bch2_gc_btree_gens(c, i);
- if (ret) {
- bch_err(c, "error recalculating oldest_gen: %i", ret);
+ ret = for_each_btree_key_commit(&trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ gc_btree_gens_key(&trans, &iter, k));
+ if (ret && !bch2_err_matches(ret, EROFS))
+ bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+ if (ret)
goto err;
- }
}
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->oldest_gen = g->gc_gen;
- up_read(&ca->bucket_lock);
- }
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_PREFETCH,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_alloc_write_oldest_gen(&trans, &iter, k));
+ if (ret && !bch2_err_matches(ret, EROFS))
+ bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+ if (ret)
+ goto err;
c->gc_gens_btree = 0;
c->gc_gens_pos = POS_MIN;
@@ -1836,8 +2031,16 @@ int bch2_gc_gens(struct bch_fs *c)
c->gc_count++;
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+ trace_and_count(c, gc_gens_end, c);
err:
+ for_each_member_device(ca, c, i) {
+ kvfree(ca->oldest_gen);
+ ca->oldest_gen = NULL;
+ }
+
+ bch2_trans_exit(&trans);
up_read(&c->gc_lock);
+ mutex_unlock(&c->gc_gens_lock);
return ret;
}
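
bch2_gc_gens() is now a three-step pass over ordinary btree keys rather than the in-memory bucket array: snapshot every bucket's current gen into ca->oldest_gen[], walk every btree that carries pointers and lower the entry for each referenced bucket to the oldest pointer gen seen, then write the result back into the alloc keys via bch2_alloc_write_oldest_gen(). A compact standalone model of those steps, with gen_cmp() standing in for the kernel's wraparound-aware comparison (all names, sizes and values here are illustrative):

#include <stdio.h>
#include <stdint.h>

#define NBUCKETS 4

/* signed 8-bit difference: positive when a is newer than b, modulo 256 */
static int gen_cmp(uint8_t a, uint8_t b) { return (int8_t) (a - b); }

int main(void)
{
	uint8_t bucket_gen[NBUCKETS] = { 10, 200, 7, 3 };
	uint8_t oldest_gen[NBUCKETS];
	struct { unsigned bucket; uint8_t gen; } ptrs[] = {
		{ 0, 9 }, { 0, 10 }, { 1, 198 }, { 3, 3 },
	};

	/* step 1: start from the current gens */
	for (unsigned b = 0; b < NBUCKETS; b++)
		oldest_gen[b] = bucket_gen[b];

	/* step 2: walk every pointer, lowering oldest_gen to the oldest
	 * generation still referenced - what gc_btree_gens_key() does */
	for (unsigned i = 0; i < sizeof(ptrs) / sizeof(ptrs[0]); i++) {
		uint8_t *g = &oldest_gen[ptrs[i].bucket];

		if (gen_cmp(*g, ptrs[i].gen) > 0)
			*g = ptrs[i].gen;
	}

	/* step 3 would write oldest_gen back into the alloc keys */
	for (unsigned b = 0; b < NBUCKETS; b++)
		printf("bucket %u: gen %u oldest_gen %u\n",
		       b, (unsigned) bucket_gen[b], (unsigned) oldest_gen[b]);
	return 0;
}

A bucket nothing points at keeps oldest_gen equal to its current gen (bucket 2 in the output), and the temporary ca->oldest_gen[] array is freed again on the way out of bch2_gc_gens().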
@@ -1890,7 +2093,7 @@ static int bch2_gc_thread(void *arg)
ret = bch2_gc_gens(c);
#endif
if (ret < 0)
- bch_err(c, "btree gc failed: %i", ret);
+ bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
debug_check_no_locks_held();
}
@@ -1920,7 +2123,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
if (IS_ERR(p)) {
- bch_err(c, "error creating gc thread: %li", PTR_ERR(p));
+ bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
return PTR_ERR(p);
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 0665f5941fcc..95d803b5743d 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -102,4 +102,11 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
return ret;
}
+static inline void bch2_do_gc_gens(struct bch_fs *c)
+{
+ atomic_inc(&c->kick_gc);
+ if (c->gc_thread)
+ wake_up_process(c->gc_thread);
+}
+
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 4ff38c6395f3..586e2f96f649 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -33,6 +33,8 @@ void bch2_btree_node_io_unlock(struct btree *b)
void bch2_btree_node_io_lock(struct btree *b)
{
+ bch2_assert_btree_nodes_not_locked();
+
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -51,12 +53,16 @@ void __bch2_btree_node_wait_on_write(struct btree *b)
void bch2_btree_node_wait_on_read(struct btree *b)
{
+ bch2_assert_btree_nodes_not_locked();
+
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_write(struct btree *b)
{
+ bch2_assert_btree_nodes_not_locked();
+
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -71,13 +77,13 @@ static void verify_no_dups(struct btree *b,
if (start == end)
return;
- for (p = start, k = bkey_next(start);
+ for (p = start, k = bkey_p_next(start);
k != end;
- p = k, k = bkey_next(k)) {
+ p = k, k = bkey_p_next(k)) {
struct bkey l = bkey_unpack_key(b, p);
struct bkey r = bkey_unpack_key(b, k);
- BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0);
+ BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
}
#endif
}
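
Keys inside a bset are stored back to back as variable-length records, each carrying its own length in 64-bit words, so "next key" simply means "advance by k->u64s words" - which is what bkey_p_next(), the rename of bkey_next() used throughout this file for packed keys, does. A self-contained model of that layout and iteration, assuming a record whose first byte holds its length in u64s (struct rec and rec_next are illustrative, not the real bkey_packed layout):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* each record is u64s 64-bit words long, header included */
struct rec {
	uint8_t  u64s;
	uint8_t  pad[7];
	uint64_t payload[];
};

static struct rec *rec_next(struct rec *k)
{
	return (struct rec *) ((uint64_t *) k + k->u64s);
}

int main(void)
{
	uint64_t buf[8];
	uint64_t *end = buf + 5;
	struct rec *k;

	memset(buf, 0, sizeof(buf));
	((struct rec *) buf)->u64s       = 2;	/* record A: 2 u64s */
	((struct rec *) (buf + 2))->u64s = 3;	/* record B: 3 u64s */

	for (k = (struct rec *) buf; (uint64_t *) k != end; k = rec_next(k))
		printf("record of %u u64s\n", (unsigned) k->u64s);
	return 0;
}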
@@ -86,7 +92,7 @@ static void set_needs_whiteout(struct bset *i, int v)
{
struct bkey_packed *k;
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
k->needs_whiteout = v;
}
@@ -99,8 +105,8 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
vpfree(p, size);
}
-static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
- bool *used_mempool)
+static void *_btree_bounce_alloc(struct bch_fs *c, size_t size,
+ bool *used_mempool)
{
unsigned flags = memalloc_nofs_save();
void *p;
@@ -108,7 +114,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > btree_bytes(c));
*used_mempool = false;
- p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = _vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
@@ -116,6 +122,8 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
memalloc_nofs_restore(flags);
return p;
}
+#define btree_bounce_alloc(_c, _size, _used_mempool) \
+ alloc_hooks(_btree_bounce_alloc(_c, _size, _used_mempool), void *, NULL)
static void sort_bkey_ptrs(const struct btree *bt,
struct bkey_packed **ptrs, unsigned nr)
@@ -169,7 +177,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
for (k = unwritten_whiteouts_start(c, b);
k != unwritten_whiteouts_end(c, b);
- k = bkey_next(k))
+ k = bkey_p_next(k))
*--ptrs = k;
sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
@@ -178,7 +186,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
while (ptrs != ptrs_end) {
bkey_copy(k, *ptrs);
- k = bkey_next(k);
+ k = bkey_p_next(k);
ptrs++;
}
@@ -250,11 +258,11 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
out = i->start;
for (k = start; k != end; k = n) {
- n = bkey_next(k);
+ n = bkey_p_next(k);
if (!bkey_deleted(k)) {
bkey_copy(out, k);
- out = bkey_next(out);
+ out = bkey_p_next(out);
} else {
BUG_ON(k->needs_whiteout);
}
@@ -445,6 +453,24 @@ void bch2_btree_build_aux_trees(struct btree *b)
}
/*
+ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
+ *
+ * The first bset is going to be of similar order to the size of the node, the
+ * last bset is bounded by btree_write_set_buffer(), which is set to keep the
+ * memmove on insert from being too expensive: the middle bset should, ideally,
+ * be the geometric mean of the first and the last.
+ *
+ * Returns true if the middle bset is greater than that geometric mean:
+ */
+static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
+{
+ unsigned mid_u64s_bits =
+ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
+
+ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
+}
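
The comment above puts the ideal size of the middle bset at the geometric mean of the first and last bset sizes, and since everything involved is compared as powers of two, should_compact_all() takes that mean in log space: it averages the two exponents instead of multiplying the sizes and taking a square root. A small illustration of why the two are the same thing (build with -lm; the exponents 15 and 7 are just example values):

#include <stdio.h>
#include <math.h>

int main(void)
{
	unsigned first_bits = 15;	/* log2 of the node-sized first bset */
	unsigned last_bits  = 7;	/* log2 of the small insert buffer   */

	/* geometric mean of the two sizes ... */
	double gmean = sqrt((double) (1U << first_bits) * (1U << last_bits));

	/* ... equals two to the average of the exponents, which is what
	 * mid_u64s_bits computes above */
	unsigned mid_bits = (first_bits + last_bits) / 2;

	printf("geometric mean %.0f == 1 << %u == %u\n",
	       gmean, mid_bits, 1U << mid_bits);
	return 0;
}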
+
+/*
* @bch_btree_init_next - initialize a new (unwritten) bset that can then be
* inserted into
*
@@ -461,19 +487,14 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
EBUG_ON(!(b->c.lock.state.seq & 1));
BUG_ON(bset_written(b, bset(b, &b->set[1])));
+ BUG_ON(btree_node_just_written(b));
if (b->nsets == MAX_BSETS &&
- !btree_node_write_in_flight(b)) {
- unsigned log_u64s[] = {
- ilog2(bset_u64s(&b->set[0])),
- ilog2(bset_u64s(&b->set[1])),
- ilog2(bset_u64s(&b->set[2])),
- };
-
- if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
- bch2_btree_node_write(c, b, SIX_LOCK_write);
- reinit_iter = true;
- }
+ !btree_node_write_in_flight(b) &&
+ should_compact_all(c, b)) {
+ bch2_btree_node_write(c, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
+ reinit_iter = true;
}
if (b->nsets == MAX_BSETS &&
@@ -495,7 +516,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
struct btree *b)
{
- pr_buf(out, "%s level %u/%u\n ",
+ prt_printf(out, "%s level %u/%u\n ",
bch2_btree_ids[b->c.btree_id],
b->c.level,
c->btree_roots[b->c.btree_id].level);
@@ -507,87 +528,112 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct btree *b, struct bset *i,
unsigned offset, int write)
{
- pr_buf(out, "error validating btree node ");
- if (write)
- pr_buf(out, "before write ");
+ prt_printf(out, bch2_log_msg(c, "%s"),
+ write == READ
+ ? "error validating btree node "
+ : "corrupt btree node before write ");
if (ca)
- pr_buf(out, "on %s ", ca->name);
- pr_buf(out, "at btree ");
+ prt_printf(out, "on %s ", ca->name);
+ prt_printf(out, "at btree ");
btree_pos_to_text(out, c, b);
- pr_buf(out, "\n node offset %u", b->written);
+ prt_printf(out, "\n node offset %u", b->written);
if (i)
- pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ prt_str(out, ": ");
}
enum btree_err_type {
+ /*
+ * We can repair this locally, and we're after the checksum check so
+ * there's no need to try another replica:
+ */
BTREE_ERR_FIXABLE,
+ /*
+ * We can repair this if we have to, but we should try reading another
+ * replica if we can:
+ */
BTREE_ERR_WANT_RETRY,
+ /*
+ * Read another replica if we have one, otherwise consider the whole
+ * node bad:
+ */
BTREE_ERR_MUST_RETRY,
- BTREE_ERR_FATAL,
+ BTREE_ERR_BAD_NODE,
+ BTREE_ERR_INCOMPATIBLE,
};
enum btree_validate_ret {
BTREE_RETRY_READ = 64,
};
+static int __btree_err(enum btree_err_type type,
+ struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b,
+ struct bset *i,
+ int write,
+ bool have_retry,
+ const char *fmt, ...)
+{
+ struct printbuf out = PRINTBUF;
+ va_list args;
+ int ret = -BCH_ERR_fsck_fix;
+
+ btree_err_msg(&out, c, ca, b, i, b->written, write);
+
+ va_start(args, fmt);
+ prt_vprintf(&out, fmt, args);
+ va_end(args);
+
+ if (write == WRITE) {
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = c->opts.errors == BCH_ON_ERROR_continue
+ ? 0
+ : -BCH_ERR_fsck_errors_not_fixed;
+ goto out;
+ }
+
+ if (!have_retry && type == BTREE_ERR_WANT_RETRY)
+ type = BTREE_ERR_FIXABLE;
+ if (!have_retry && type == BTREE_ERR_MUST_RETRY)
+ type = BTREE_ERR_BAD_NODE;
+
+ switch (type) {
+ case BTREE_ERR_FIXABLE:
+ mustfix_fsck_err(c, "%s", out.buf);
+ ret = -BCH_ERR_fsck_fix;
+ break;
+ case BTREE_ERR_WANT_RETRY:
+ case BTREE_ERR_MUST_RETRY:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = BTREE_RETRY_READ;
+ break;
+ case BTREE_ERR_BAD_NODE:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ bch2_topology_error(c);
+ ret = -BCH_ERR_need_topology_repair;
+ break;
+ case BTREE_ERR_INCOMPATIBLE:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ break;
+ default:
+ BUG();
+ }
+out:
+fsck_err:
+ printbuf_exit(&out);
+ return ret;
+}
+
#define btree_err(type, c, ca, b, i, msg, ...) \
({ \
- __label__ out; \
- char _buf[300]; \
- char *_buf2 = _buf; \
- struct printbuf out = PBUF(_buf); \
- \
- _buf2 = kmalloc(4096, GFP_ATOMIC); \
- if (_buf2) \
- out = _PBUF(_buf2, 4986); \
- \
- btree_err_msg(&out, c, ca, b, i, b->written, write); \
- pr_buf(&out, ": " msg, ##__VA_ARGS__); \
- \
- if (type == BTREE_ERR_FIXABLE && \
- write == READ && \
- !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", _buf2); \
- goto out; \
- } \
+ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
\
- switch (write) { \
- case READ: \
- if (_buf2) \
- bch_err(c, "%s", _buf2); \
- \
- switch (type) { \
- case BTREE_ERR_FIXABLE: \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- case BTREE_ERR_WANT_RETRY: \
- if (have_retry) { \
- ret = BTREE_RETRY_READ; \
- goto fsck_err; \
- } \
- break; \
- case BTREE_ERR_MUST_RETRY: \
- ret = BTREE_RETRY_READ; \
- goto fsck_err; \
- case BTREE_ERR_FATAL: \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- } \
- break; \
- case WRITE: \
- bch_err(c, "corrupt metadata before write: %s", _buf2); \
- \
- if (bch2_fs_inconsistent(c)) { \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
- goto fsck_err; \
- } \
- break; \
- } \
-out: \
- if (_buf2 != _buf) \
- kfree(_buf2); \
- true; \
+ if (_ret != -BCH_ERR_fsck_fix) \
+ goto fsck_err; \
+ *saw_error = true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
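
btree_err() is now a thin wrapper: __btree_err() formats the message and classifies the failure, and the macro keeps only the control flow - record that an error was seen and jump to the function-local fsck_err: label unless the problem could be fixed in place. A simplified standalone sketch of that check-and-bail macro pattern, written as a do/while instead of the GCC statement expression the kernel macro uses, and with illustrative names throughout (check_err_on, report, err:):

#include <stdio.h>

#define ERR_FIX   0	/* repaired locally, keep going */
#define ERR_FATAL -1	/* give up on this node         */

static int report(int severity, const char *msg)
{
	printf("%s: %s\n", severity == ERR_FIX ? "fixing" : "fatal", msg);
	return severity;
}

/* bail out to the caller's cleanup label unless the problem was repaired */
#define check_err_on(cond, severity, msg)		\
do {							\
	if (cond) {					\
		ret = report(severity, msg);		\
		if (ret)				\
			goto err;			\
	}						\
} while (0)

static int validate(int version, int u64s)
{
	int ret = 0;

	check_err_on(u64s == 0, ERR_FIX, "empty bset");
	check_err_on(version > 100, ERR_FATAL, "unsupported version");
	printf("validated\n");
err:
	return ret;
}

int main(void)
{
	validate(10, 0);	/* fixed, continues    */
	validate(200, 8);	/* fatal, jumps to err */
	return 0;
}

The real btree_err_on() stays an expression so it can sit inside an if () and trigger the in-place repairs that follow it.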
@@ -596,6 +642,7 @@ out: \
* When btree topology repair changes the start or end of a node, that might
* mean we have to drop keys that are no longer inside the node:
*/
+__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
struct bset_tree *t;
@@ -607,7 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
struct bset *i = bset(b, t);
struct bkey_packed *k;
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
break;
@@ -618,43 +665,46 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
(u64 *) vstruct_end(i) - (u64 *) k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
}
- for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
break;
if (k != vstruct_last(i)) {
i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
}
}
+ /*
+ * Always rebuild search trees: eytzinger search tree nodes directly
+ * depend on the values of min/max key:
+ */
+ bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
- BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
- BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
}
}
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
unsigned offset, unsigned sectors,
- int write, bool have_retry)
+ int write, bool have_retry, bool *saw_error)
{
unsigned version = le16_to_cpu(i->version);
const char *err;
- char buf1[100];
- char buf2[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
int ret = 0;
btree_err_on((version != BCH_BSET_VERSION_OLD &&
version < bcachefs_metadata_version_min) ||
version >= bcachefs_metadata_version_max,
- BTREE_ERR_FATAL, c, ca, b, i,
+ BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
"unsupported bset version");
if (btree_err_on(version < c->sb.version_min,
@@ -678,14 +728,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
- BTREE_ERR_FATAL, c, ca, b, i,
+ BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
"BSET_SEPARATE_WHITEOUTS no longer supported");
if (btree_err_on(offset + sectors > btree_sectors(c),
BTREE_ERR_FIXABLE, c, ca, b, i,
"bset past end of btree node")) {
i->u64s = 0;
- return 0;
+ ret = 0;
+ goto out;
}
btree_err_on(offset && !i->u64s,
@@ -733,17 +784,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
b->data->max_key = b->key.k.p;
}
- btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
+ btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect min_key: got %s should be %s",
- (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+ (printbuf_reset(&buf2),
+ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
}
- btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
+ btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect max key %s",
- (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
if (write)
compat_btree_node(b->c.level, b->c.btree_id, version,
@@ -751,23 +805,37 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
err = bch2_bkey_format_validate(&bn->format);
btree_err_on(err,
- BTREE_ERR_FATAL, c, ca, b, i,
+ BTREE_ERR_BAD_NODE, c, ca, b, i,
"invalid bkey format: %s", err);
compat_bformat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
+out:
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
+static int bset_key_invalid(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ bool updated_range, int rw,
+ struct printbuf *err)
+{
+ return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
+ (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+}
+
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned *whiteout_u64s,
- int write, bool have_retry)
+ struct bset *i, int write,
+ bool have_retry, bool *saw_error)
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
+ struct printbuf buf = PRINTBUF;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
int ret = 0;
@@ -776,9 +844,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k != vstruct_last(i);) {
struct bkey_s u;
struct bkey tmp;
- const char *invalid;
- if (btree_err_on(bkey_next(k) > vstruct_last(i),
+ if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
BTREE_ERR_FIXABLE, c, NULL, b, i,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -789,7 +856,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
BTREE_ERR_FIXABLE, c, NULL, b, i,
"invalid bkey format %u", k->format)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
+ memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
@@ -802,18 +869,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
u = __bkey_disassemble(b, k, &tmp);
- invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
- (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?:
- (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
- if (invalid) {
- char buf[160];
+ printbuf_reset(&buf);
+ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "invalid bkey: ");
+ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "invalid bkey: %s\n%s", invalid, buf);
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
+ memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
@@ -824,34 +891,34 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
&b->format, k);
if (prev && bkey_iter_cmp(b, prev, k) > 0) {
- char buf1[80];
- char buf2[80];
struct bkey up = bkey_unpack_key(b, prev);
- bch2_bkey_to_text(&PBUF(buf1), &up);
- bch2_bkey_to_text(&PBUF(buf2), u.k);
+ printbuf_reset(&buf);
+ prt_printf(&buf, "keys out of order: ");
+ bch2_bkey_to_text(&buf, &up);
+ prt_printf(&buf, " > ");
+ bch2_bkey_to_text(&buf, u.k);
bch2_dump_bset(c, b, i, 0);
- if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "keys out of order: %s > %s",
- buf1, buf2)) {
+ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
+ memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
continue;
}
}
prev = k;
- k = bkey_next(k);
+ k = bkey_p_next(k);
}
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
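
When validate_bset_keys() rejects a key it repairs the bset in place: shrink i->u64s by the key's length, slide everything after the key down with memmove_u64s_down(), and deliberately leave the cursor where it is so the record that now occupies that slot is validated on the next iteration. A minimal standalone model of that drop-a-record operation on a packed buffer of u64s (layout and values are illustrative):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	/* a packed "bset": each record is rec[0] u64s long, header included */
	uint64_t buf[8] = {
		2, 0xaa,		/* record A: 2 u64s */
		3, 0xbb, 0xbb,		/* record B: 3 u64s */
		2, 0xcc,		/* record C: 2 u64s */
	};
	unsigned total_u64s = 7;
	uint64_t *k = buf + 2;		/* pretend record B failed validation */
	unsigned drop = k[0];

	/* shrink the set, then slide the tail down over the bad record;
	 * the cursor k is not advanced, so record C is checked next */
	total_u64s -= drop;
	memmove(k, k + drop, (total_u64s - (unsigned) (k - buf)) * sizeof(uint64_t));

	for (unsigned i = 0; i < total_u64s; i++)
		printf("%llx ", (unsigned long long) buf[i]);
	printf("\n");			/* 2 aa 2 cc */
	return 0;
}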
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, bool have_retry)
+ struct btree *b, bool have_retry, bool *saw_error)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
@@ -865,9 +932,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
unsigned u64s;
unsigned blacklisted_written, nonblacklisted_written = 0;
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
- int ret, retry_read = 0, write = READ;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0, retry_read = 0, write = READ;
b->version_ondisk = U16_MAX;
+ /* We might get called multiple times on read retry: */
+ b->written = 0;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
sort_iter_init(iter, b);
@@ -879,11 +949,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
- "bad magic");
+ "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(b->data->magic));
btree_err_on(!b->data->keys.seq,
BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
- "bad btree header");
+ "bad btree header: seq 0");
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
@@ -896,7 +967,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
- unsigned sectors, whiteout_u64s = 0;
+ unsigned sectors;
struct nonce nonce;
struct bch_csum csum;
bool first = !b->written;
@@ -916,11 +987,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i", ret))
+ goto fsck_err;
- btree_err_on(btree_node_is_extents(b) &&
+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
- BTREE_ERR_FATAL, c, NULL, b, NULL,
+ BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL,
"btree node does not have NEW_EXTENT_OVERWRITE set");
sectors = vstruct_sectors(b->data, c->block_bits);
@@ -943,7 +1017,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i\n", ret))
+ goto fsck_err;
sectors = vstruct_sectors(bne, c->block_bits);
}
@@ -952,15 +1029,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le16_to_cpu(i->version));
ret = validate_bset(c, ca, b, i, b->written, sectors,
- READ, have_retry);
+ READ, have_retry, saw_error);
if (ret)
goto fsck_err;
if (!b->written)
btree_node_set_format(b, b->data->format);
- ret = validate_bset_keys(c, b, i, &whiteout_u64s,
- READ, have_retry);
+ ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
if (ret)
goto fsck_err;
@@ -986,11 +1062,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (blacklisted && !first)
continue;
- sort_iter_add(iter, i->start,
- vstruct_idx(i, whiteout_u64s));
-
sort_iter_add(iter,
- vstruct_idx(i, whiteout_u64s),
+ vstruct_idx(i, 0),
vstruct_last(i));
nonblacklisted_written = b->written;
@@ -1050,21 +1123,25 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
- if (invalid ||
+ printbuf_reset(&buf);
+
+ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
(bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
- char buf[160];
+ printbuf_reset(&buf);
+
+ prt_printf(&buf, "invalid bkey: ");
+ bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
- bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
- "invalid bkey %s: %s", buf, invalid);
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_next(k),
+ memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
set_btree_bset_end(b, b->set);
continue;
@@ -1076,7 +1153,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bp.v->mem_ptr = 0;
}
- k = bkey_next(k);
+ k = bkey_p_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
@@ -1096,14 +1173,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_node_need_rewrite(b);
out:
mempool_free(iter, &c->fill_iter);
+ printbuf_exit(&buf);
return retry_read;
fsck_err:
- if (ret == BTREE_RETRY_READ) {
+ if (ret == BTREE_RETRY_READ)
retry_read = 1;
- } else {
- bch2_inconsistent_error(c);
+ else
set_btree_node_read_error(b);
- }
goto out;
}
@@ -1116,18 +1192,18 @@ static void btree_node_read_work(struct work_struct *work)
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
- char buf[200];
- struct printbuf out;
+ struct printbuf buf = PRINTBUF;
bool saw_error = false;
+ bool retry = false;
bool can_retry;
goto start;
while (1) {
+ retry = true;
bch_info(c, "retrying read");
ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
- bio_reset(bio);
- bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
+ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
@@ -1138,10 +1214,10 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_status = BLK_STS_REMOVED;
}
start:
- out = PBUF(buf);
- btree_pos_to_text(&out, c, b);
+ printbuf_reset(&buf);
+ btree_pos_to_text(&buf, c, b);
bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf);
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
@@ -1153,8 +1229,11 @@ start:
&failed, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, ca, b, can_retry))
+ !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
+ if (retry)
+ bch_info(c, "retry success");
break;
+ }
saw_error = true;
@@ -1167,9 +1246,18 @@ start:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);
+ printbuf_exit(&buf);
+
+ if (saw_error && !btree_node_read_error(b)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, b->key.k.p);
+ bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+ __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
+ printbuf_exit(&buf);
- if (saw_error && !btree_node_read_error(b))
bch2_btree_node_rewrite_async(c, b);
+ }
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1183,6 +1271,7 @@ static void btree_node_read_endio(struct bio *bio)
if (rb->have_ioref) {
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+
bch2_latency_acct(ca, rb->start_time, READ);
}
@@ -1247,12 +1336,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
container_of(cl, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
+ struct printbuf buf = PRINTBUF;
bool dump_bset_maps = false;
bool have_retry = false;
int ret = 0, best = -1, write = READ;
unsigned i, written = 0, written2 = 0;
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+ bool _saw_error = false, *saw_error = &_saw_error;
for (i = 0; i < ra->nr; i++) {
struct btree_node *bn = ra->buf[i];
@@ -1290,8 +1381,6 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
fsck_err:
if (dump_bset_maps) {
for (i = 0; i < ra->nr; i++) {
- char buf[200];
- struct printbuf out = PBUF(buf);
struct btree_node *bn = ra->buf[i];
struct btree_node_entry *bne = NULL;
unsigned offset = 0, sectors;
@@ -1300,6 +1389,8 @@ fsck_err:
if (ra->err[i])
continue;
+ printbuf_reset(&buf);
+
while (offset < btree_sectors(c)) {
if (!offset) {
sectors = vstruct_sectors(bn, c->block_bits);
@@ -1310,10 +1401,10 @@ fsck_err:
sectors = vstruct_sectors(bne, c->block_bits);
}
- pr_buf(&out, " %u-%u", offset, offset + sectors);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
if (bne && bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
- pr_buf(&out, "*");
+ prt_printf(&buf, "*");
offset += sectors;
}
@@ -1321,31 +1412,33 @@ fsck_err:
bne = ra->buf[i] + (offset << 9);
if (bne->keys.seq == bn->keys.seq) {
if (!gap)
- pr_buf(&out, " GAP");
+ prt_printf(&buf, " GAP");
gap = true;
sectors = vstruct_sectors(bne, c->block_bits);
- pr_buf(&out, " %u-%u", offset, offset + sectors);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
if (bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
- pr_buf(&out, "*");
+ prt_printf(&buf, "*");
}
offset++;
}
- bch_err(c, "replica %u:%s", i, buf);
+ bch_err(c, "replica %u:%s", i, buf.buf);
}
}
if (best >= 0) {
memcpy(b->data, ra->buf[best], btree_bytes(c));
- ret = bch2_btree_node_read_done(c, NULL, b, false);
+ ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
} else {
ret = -1;
}
if (ret)
set_btree_node_read_error(b);
+ else if (*saw_error)
+ bch2_btree_node_rewrite_async(c, b);
for (i = 0; i < ra->nr; i++) {
mempool_free(ra->buf[i], &c->btree_bounce_pool);
@@ -1354,6 +1447,7 @@ fsck_err:
closure_debug_destroy(&ra->cl);
kfree(ra);
+ printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1368,6 +1462,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
if (rb->have_ioref) {
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+
bch2_latency_acct(ca, rb->start_time, READ);
}
@@ -1390,7 +1485,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
closure_init(&ra->cl, NULL);
ra->c = c;
@@ -1399,8 +1494,10 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
- ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i],
- btree_bytes(c)),
+ ra->bio[i] = bio_alloc_bioset(NULL,
+ buf_pages(ra->buf[i], btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
&c->btree_bio);
}
@@ -1416,7 +1513,6 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->idx = i;
rb->pick = pick;
- rb->bio.bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
@@ -1453,11 +1549,9 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
- char buf[200];
int ret;
- btree_pos_to_text(&PBUF(buf), c, b);
- trace_btree_read(c, b);
+ trace_and_count(c, btree_node_read, c, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
@@ -1465,17 +1559,30 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick);
- if (bch2_fs_fatal_err_on(ret <= 0, c,
- "btree node read error: no device to read from\n"
- " at %s", buf)) {
+
+ if (ret <= 0) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "btree node read error: no device to read from\n at ");
+ btree_pos_to_text(&buf, c, b);
+ bch_err(c, "%s", buf.buf);
+
+ if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags))
+ bch2_fatal_error(c);
+
set_btree_node_read_error(b);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+ printbuf_exit(&buf);
return;
}
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
- bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data,
- btree_bytes(c)),
+ bio = bio_alloc_bioset(NULL,
+ buf_pages(b->data, btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOIO,
&c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
@@ -1485,7 +1592,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
- bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
bch2_bio_map(bio, b->data, btree_bytes(c));
@@ -1512,9 +1618,10 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
}
}
-int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
- const struct bkey_i *k, unsigned level)
+static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
{
+ struct bch_fs *c = trans->c;
struct closure cl;
struct btree *b;
int ret;
@@ -1526,7 +1633,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
bch2_btree_cache_cannibalize_unlock(c);
BUG_ON(IS_ERR(b));
@@ -1557,6 +1664,13 @@ err:
return ret;
}
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+{
+ return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level));
+
+}
+
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
@@ -1576,10 +1690,11 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
bch2_journal_pin_drop(&c->journal, &w->journal);
}
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new, v;
+ unsigned type = 0;
bch2_btree_complete_write(c, b, w);
@@ -1587,32 +1702,20 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
do {
old = new = v;
- if (old & (1U << BTREE_NODE_need_write))
- goto do_write;
-
- new &= ~(1U << BTREE_NODE_write_in_flight);
- new &= ~(1U << BTREE_NODE_write_in_flight_inner);
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
-
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
- return;
-
-do_write:
- six_lock_read(&b->c.lock, NULL, NULL);
- v = READ_ONCE(b->flags);
- do {
- old = new = v;
-
if ((old & (1U << BTREE_NODE_dirty)) &&
(old & (1U << BTREE_NODE_need_write)) &&
!(old & (1U << BTREE_NODE_never_write)) &&
- btree_node_may_write(b)) {
+ !(old & (1U << BTREE_NODE_write_blocked)) &&
+ !(old & (1U << BTREE_NODE_will_make_reachable))) {
new &= ~(1U << BTREE_NODE_dirty);
new &= ~(1U << BTREE_NODE_need_write);
new |= (1U << BTREE_NODE_write_in_flight);
new |= (1U << BTREE_NODE_write_in_flight_inner);
new |= (1U << BTREE_NODE_just_written);
new ^= (1U << BTREE_NODE_write_idx);
+
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
} else {
new &= ~(1U << BTREE_NODE_write_in_flight);
new &= ~(1U << BTREE_NODE_write_in_flight_inner);
@@ -1620,9 +1723,22 @@ do_write:
} while ((v = cmpxchg(&b->flags, old, new)) != old);
if (new & (1U << BTREE_NODE_write_in_flight))
- __bch2_btree_node_write(c, b, true);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
+ else
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
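
__btree_node_write_done() updates b->flags with the usual lock-free pattern: read the word, derive the complete next state from it - either hand the node straight off to another write, or clear the write_in_flight bits - and retry the cmpxchg until it lands without anyone racing in between. A small C11 sketch of that read-recompute-retry loop, with a much-reduced set of states and made-up flag names:

#include <stdatomic.h>
#include <stdio.h>

#define NODE_DIRTY		(1U << 0)
#define NODE_NEED_WRITE		(1U << 1)
#define NODE_WRITE_IN_FLIGHT	(1U << 2)

static _Atomic unsigned flags = NODE_DIRTY | NODE_NEED_WRITE | NODE_WRITE_IN_FLIGHT;

/* recompute the whole next state from the old value and retry until the
 * compare-and-swap succeeds; returns nonzero if another write should start */
static int write_done(void)
{
	unsigned old = atomic_load(&flags), new;

	do {
		new = old;

		if ((old & NODE_DIRTY) && (old & NODE_NEED_WRITE)) {
			/* more work arrived while the write was in flight:
			 * keep WRITE_IN_FLIGHT and go around again */
			new &= ~(NODE_DIRTY | NODE_NEED_WRITE);
		} else {
			new &= ~NODE_WRITE_IN_FLIGHT;
		}
	} while (!atomic_compare_exchange_weak(&flags, &old, new));

	return !!(new & NODE_WRITE_IN_FLIGHT);
}

int main(void)
{
	printf("start another write: %d\n", write_done());
	printf("flags now: %#x\n", (unsigned) atomic_load(&flags));
	return 0;
}

The kernel version has more states to respect (never_write, write_blocked, will_make_reachable) and also pulls the pending write type out of the same word, but the shape of the loop is the same.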
+
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+ struct btree_trans trans;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ __btree_node_write_done(c, b);
six_unlock_read(&b->c.lock);
+
+ bch2_trans_exit(&trans);
}
static void btree_node_write_work(struct work_struct *work)
@@ -1706,14 +1822,21 @@ static void btree_node_write_endio(struct bio *bio)
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- unsigned whiteout_u64s = 0;
+ struct printbuf buf = PRINTBUF;
+ bool saw_error;
int ret;
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
- return -1;
+ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
+ BKEY_TYPE_btree, WRITE, &buf);
+
+ if (ret)
+ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
+ printbuf_exit(&buf);
+ if (ret)
+ return ret;
- ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false);
+ ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -1726,17 +1849,18 @@ static void btree_write_submit(struct work_struct *work)
{
struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
struct bch_extent_ptr *ptr;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
bkey_copy(&tmp.k, &wbio->key);
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
ptr->offset += wbio->sector_offset;
- bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k);
+ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
+ &tmp.k, false);
}
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
struct btree_write_bio *wbio;
struct bset_tree *t;
@@ -1750,14 +1874,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
bool used_mempool;
unsigned long old, new;
bool validate_before_checksum = false;
+ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
+ int ret;
- if (already_started)
+ if (flags & BTREE_WRITE_ALREADY_STARTED)
goto do_write;
- if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
- return;
-
/*
* We may only have a read lock on the btree node - the dirty bit is our
* "lock" against racing with other threads that may be trying to start
@@ -1771,13 +1894,25 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
if (!(old & (1 << BTREE_NODE_dirty)))
return;
- if (!btree_node_may_write(b))
+ if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+ !(old & (1 << BTREE_NODE_need_write)))
+ return;
+
+ if (old &
+ ((1 << BTREE_NODE_never_write)|
+ (1 << BTREE_NODE_write_blocked)))
+ return;
+
+ if (b->written &&
+ (old & (1 << BTREE_NODE_will_make_reachable)))
return;
- if (old & (1 << BTREE_NODE_never_write))
+ if (old & (1 << BTREE_NODE_write_in_flight))
return;
- BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
+ if (flags & BTREE_WRITE_ONLY_IF_NEED)
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
new &= ~(1 << BTREE_NODE_dirty);
new &= ~(1 << BTREE_NODE_need_write);
@@ -1790,6 +1925,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_sta
if (new & (1U << BTREE_NODE_need_write))
return;
do_write:
+ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
+
atomic_dec(&c->btree_cache.dirty);
BUG_ON(btree_node_fake(b));
@@ -1857,6 +1994,8 @@ do_write:
u64s = bch2_sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
+ BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
+
set_needs_whiteout(i, false);
/* do we have data to write? */
@@ -1866,6 +2005,10 @@ do_write:
bytes_to_write = vstruct_end(i) - data;
sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+ if (!b->written &&
+ b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
@@ -1873,7 +2016,7 @@ do_write:
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
- i->version = c->sb.version < bcachefs_metadata_version_new_versioning
+ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
? cpu_to_le16(BCH_BSET_VERSION_OLD)
: cpu_to_le16(c->sb.version);
SET_BSET_OFFSET(i, b->written);
@@ -1891,7 +2034,10 @@ do_write:
validate_bset_for_write(c, b, i, sectors_to_write))
goto err;
- bset_encrypt(c, i, b->written << 9);
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error encrypting btree node: %i\n", ret))
+ goto err;
nonce = btree_nonce(i, b->written << 9);
@@ -1927,10 +2073,12 @@ do_write:
c->opts.nochanges)
goto err;
- trace_btree_write(b, bytes_to_write, sectors_to_write);
+ trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
- wbio = container_of(bio_alloc_bioset(GFP_NOIO,
+ wbio = container_of(bio_alloc_bioset(NULL,
buf_pages(data, sectors_to_write << 9),
+ REQ_OP_WRITE|REQ_META,
+ GFP_NOIO,
&c->btree_bio),
struct btree_write_bio, wbio.bio);
wbio_init(&wbio->wbio.bio);
@@ -1940,7 +2088,6 @@ do_write:
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
- wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
@@ -1950,31 +2097,22 @@ do_write:
b->written += sectors_to_write;
- if (wbio->wbio.first_btree_write &&
- b->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
- cpu_to_le16(b->written);
-
if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
cpu_to_le16(b->written);
- atomic64_inc(&c->btree_writes_nr);
- atomic64_add(sectors_to_write, &c->btree_writes_sectors);
+ atomic64_inc(&c->btree_write_stats[type].nr);
+ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
queue_work(c->io_complete_wq, &wbio->work);
return;
err:
set_btree_node_noevict(b);
- if (!b->written &&
- b->key.k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
- cpu_to_le16(sectors_to_write);
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
- btree_node_write_done(c, b);
+ __btree_node_write_done(c, b);
}
/*
@@ -2037,12 +2175,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
* Use this one if the node is intent locked:
*/
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- enum six_lock_type lock_type_held)
+ enum six_lock_type lock_type_held,
+ unsigned flags)
{
if (lock_type_held == SIX_LOCK_intent ||
(lock_type_held == SIX_LOCK_read &&
six_lock_tryupgrade(&b->c.lock))) {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, flags);
/* don't cycle lock unnecessarily: */
if (btree_node_just_written(b) &&
@@ -2054,64 +2193,70 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->c.lock);
} else {
- __bch2_btree_node_write(c, b, false);
+ __bch2_btree_node_write(c, b, flags);
if (lock_type_held == SIX_LOCK_write &&
btree_node_just_written(b))
bch2_btree_post_write_cleanup(c, b);
}
}
-static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
unsigned i;
+ bool ret = false;
restart:
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (test_bit(flag, &b->flags)) {
rcu_read_unlock();
wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+ ret = true;
goto restart;
-
}
rcu_read_unlock();
-}
-void bch2_btree_flush_all_reads(struct bch_fs *c)
-{
- __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+ return ret;
}
-void bch2_btree_flush_all_writes(struct bch_fs *c)
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
- __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}
-void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
+ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+}
- if (!(flags & (1 << BTREE_NODE_dirty)))
- continue;
+const char * const bch2_btree_write_types[] = {
+#define x(t, n) [n] = #t,
+ BCH_BTREE_WRITE_TYPES()
+ NULL
+};
- pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
- b,
- (flags & (1 << BTREE_NODE_dirty)) != 0,
- (flags & (1 << BTREE_NODE_need_write)) != 0,
- b->c.level,
- b->written,
- !list_empty_careful(&b->write_blocked),
- b->will_make_reachable != 0,
- b->will_make_reachable & 1);
+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 10);
+
+ prt_tab(out);
+ prt_str(out, "nr");
+ prt_tab(out);
+ prt_str(out, "size");
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
+ u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
+ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
+
+ prt_printf(out, "%s:", bch2_btree_write_types[i]);
+ prt_tab(out);
+ prt_u64(out, nr);
+ prt_tab(out);
+ prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
+ prt_newline(out);
}
- rcu_read_unlock();
}
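For reference, the table printed above replaces the old global btree_writes_nr/btree_writes_sectors counters with one { nr, bytes } pair per write type, and bch2_btree_write_stats_to_text() reports the count plus the mean write size (bytes / nr) for each type. A standalone model of that accounting, in plain C rather than kernel code (all names here are stand-ins):

#include <stdint.h>
#include <stdio.h>

#define WRITE_TYPE_NR	4			/* stand-in for BTREE_WRITE_TYPE_NR */

struct write_stats { uint64_t nr, bytes; };

static struct write_stats stats[WRITE_TYPE_NR];

static void account_write(unsigned type, uint64_t bytes_written)
{
	stats[type].nr++;			/* atomic64_inc(&c->btree_write_stats[type].nr) */
	stats[type].bytes += bytes_written;	/* atomic64_add(bytes_to_write, ...bytes)       */
}

static void print_write_stats(void)
{
	for (unsigned i = 0; i < WRITE_TYPE_NR; i++)
		printf("type %u: nr %llu mean size %llu\n", i,
		       (unsigned long long) stats[i].nr,
		       (unsigned long long) (stats[i].nr ? stats[i].bytes / stats[i].nr : 0));
}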
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index f11a2e96227b..c43fb60b8c82 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -15,18 +15,13 @@ struct btree;
struct btree_iter;
struct btree_node_read_all;
-static inline bool btree_node_dirty(struct btree *b)
-{
- return test_bit(BTREE_NODE_dirty, &b->flags);
-}
-
-static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
atomic_inc(&c->btree_cache.dirty);
}
-static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
atomic_dec(&c->btree_cache.dirty);
@@ -67,12 +62,6 @@ void __bch2_btree_node_wait_on_write(struct btree *);
void bch2_btree_node_wait_on_read(struct btree *);
void bch2_btree_node_wait_on_write(struct btree *);
-static inline bool btree_node_may_write(struct btree *b)
-{
- return list_empty_careful(&b->write_blocked) &&
- (!b->written || !b->will_make_reachable);
-}
-
enum compact_mode {
COMPACT_LAZY,
COMPACT_ALL,
@@ -111,22 +100,25 @@ static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
}};
}
-static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
{
struct nonce nonce = btree_nonce(i, offset);
+ int ret;
if (!offset) {
struct btree_node *bn = container_of(i, struct btree_node, keys);
unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
- bytes);
+ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &bn->flags, bytes);
+ if (ret)
+ return ret;
- nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
}
- bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
- vstruct_end(i) - (void *) i->_data);
+ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
}
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
@@ -137,7 +129,7 @@ void bch2_btree_build_aux_trees(struct btree *);
void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
- struct btree *, bool);
+ struct btree *, bool, bool *);
void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
@@ -145,41 +137,27 @@ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);
-void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+enum btree_write_flags {
+ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
+ __BTREE_WRITE_ALREADY_STARTED,
+};
+#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED )
+#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
- enum six_lock_type);
+ enum six_lock_type, unsigned);
static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
enum six_lock_type lock_held)
{
- if (b->written &&
- btree_node_need_write(b) &&
- btree_node_may_write(b) &&
- !btree_node_write_in_flight(b))
- bch2_btree_node_write(c, b, lock_held);
+ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}
-#define bch2_btree_node_write_cond(_c, _b, cond) \
-do { \
- unsigned long old, new, v = READ_ONCE((_b)->flags); \
- \
- do { \
- old = new = v; \
- \
- if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \
- break; \
- \
- new |= (1 << BTREE_NODE_need_write); \
- } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
- \
- btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
-} while (0)
-
-void bch2_btree_flush_all_reads(struct bch_fs *);
-void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
unsigned version, unsigned big_endian,
@@ -223,7 +201,7 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
{
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bpos_cmp(bn->min_key, POS_MIN) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
write)
bn->min_key = bpos_nosnap_predecessor(bn->min_key);
@@ -240,9 +218,11 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bpos_cmp(bn->min_key, POS_MIN) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
!write)
bn->min_key = bpos_nosnap_successor(bn->min_key);
}
+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_IO_H */
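Btree node writes now take a single unsigned flags word: the low BTREE_WRITE_TYPE_BITS bits carry the write type (extracted in __bch2_btree_node_write() as flags & BTREE_WRITE_TYPE_MASK, and checked against BTREE_WRITE_initial for a node's first write), while the boolean flags declared above start at bit BTREE_WRITE_TYPE_BITS. A sketch of that layout with an illustrative 2-bit type field - the real BTREE_WRITE_TYPE_BITS and BCH_BTREE_WRITE_TYPES() definitions live in btree_types.h, outside this diff:

/* Illustrative layout only - the width and names below are stand-ins. */
#define SKETCH_TYPE_BITS	2
#define SKETCH_TYPE_MASK	((1U << SKETCH_TYPE_BITS) - 1)
#define SKETCH_ONLY_IF_NEED	(1U << SKETCH_TYPE_BITS)	/* first boolean flag  */
#define SKETCH_ALREADY_STARTED	(1U << (SKETCH_TYPE_BITS + 1))	/* second boolean flag */

static void sketch_node_write(unsigned flags)
{
	unsigned type = flags & SKETCH_TYPE_MASK;	/* which write kind / stats bucket */

	if (flags & SKETCH_ALREADY_STARTED) {
		/* the write was already claimed via the node's flags word */
	}

	(void) type;
}

/* A caller would pass e.g. sketch_node_write(1 | SKETCH_ONLY_IF_NEED); */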
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 6cfac32f711f..f524e4b394c3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -16,19 +16,17 @@
#include "replicas.h"
#include "subvolume.h"
+#include <linux/random.h>
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
-static void btree_trans_verify_sorted(struct btree_trans *);
-static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
-
static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
struct btree_path *);
static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
+#ifdef TRACK_PATH_ALLOCATED
return iter->ip_allocated;
#else
return 0;
@@ -46,7 +44,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans)
if (need_resched() || race_fault()) {
bch2_trans_unlock(trans);
schedule();
- return bch2_trans_relock(trans) ? 0 : -EINTR;
+ return bch2_trans_relock(trans);
} else {
return 0;
}
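The -EINTR return above is gone because transaction restarts are now signalled with private BCH_ERR_transaction_restart_* codes (bch2_trans_relock() and friends return them directly), and callers test for the whole class with bch2_err_matches(). A sketch of the retry idiom such codes are designed for - the do_work callback stands in for any transactional helper and is not part of this patch:

/* Illustrative retry loop, assuming the usual bcachefs transaction helpers. */
static int retry_transactionally(struct btree_trans *trans,
				 int (*do_work)(struct btree_trans *))
{
	int ret;

	do {
		bch2_trans_begin(trans);	/* resets restart state before retrying */
		ret = do_work(trans);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	return ret;
}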
@@ -58,6 +56,9 @@ static inline int __btree_path_cmp(const struct btree_path *l,
struct bpos r_pos,
unsigned r_level)
{
+ /*
+ * Must match lock ordering as defined by __bch2_btree_node_lock:
+ */
return cmp_int(l->btree_id, r_btree_id) ?:
cmp_int((int) l->cached, (int) r_cached) ?:
bpos_cmp(l->pos, r_pos) ?:
@@ -96,18 +97,12 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos
return p;
}
-static inline bool is_btree_node(struct btree_path *path, unsigned l)
-{
- return l < BTREE_MAX_DEPTH &&
- (unsigned long) path->l[l].b >= 128;
-}
-
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
struct bpos pos = iter->pos;
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(pos, POS_MAX))
+ !bkey_eq(pos, POS_MAX))
pos = bkey_successor(iter, pos);
return pos;
}
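The hunks that follow convert open-coded bkey_cmp()/bpos_cmp() tests into boolean helpers (bkey_eq/lt/gt/le/ge and their bpos_* counterparts); note that some call sites also swap argument order, e.g. bpos_cmp(b->key.k.p, path->pos) < 0 becomes bpos_gt(path->pos, b->key.k.p). A minimal sketch of what such helpers look like, assuming they are plain wrappers over the existing three-way compares (their real definitions are in bkey.h, outside this diff):

/* Illustrative wrappers only - the real helpers live in bkey.h. */
static inline bool bpos_eq_sketch(struct bpos l, struct bpos r)
{
	return !bpos_cmp(l, r);
}

static inline bool bpos_lt_sketch(struct bpos l, struct bpos r)
{
	return bpos_cmp(l, r) < 0;
}

static inline bool bpos_gt_sketch(struct bpos l, struct bpos r)
{
	return bpos_cmp(l, r) > 0;
}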
@@ -115,13 +110,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
static inline bool btree_path_pos_before_node(struct btree_path *path,
struct btree *b)
{
- return bpos_cmp(path->pos, b->data->min_key) < 0;
+ return bpos_lt(path->pos, b->data->min_key);
}
static inline bool btree_path_pos_after_node(struct btree_path *path,
struct btree *b)
{
- return bpos_cmp(b->key.k.p, path->pos) < 0;
+ return bpos_gt(path->pos, b->key.k.p);
}
static inline bool btree_path_pos_in_node(struct btree_path *path,
@@ -132,449 +127,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
!btree_path_pos_after_node(path, b);
}
-/* Btree node locking: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
-{
- struct btree_path *linked;
- unsigned readers = 0;
-
- trans_for_each_path(trans, linked)
- if (linked->l[b->c.level].b == b &&
- btree_node_read_locked(linked, b->c.level))
- readers++;
-
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- if (!b->c.lock.readers)
- atomic64_sub(__SIX_VAL(read_lock, readers),
- &b->c.lock.state.counter);
- else
- this_cpu_sub(*b->c.lock.readers, readers);
-
- btree_node_lock_type(trans->c, b, SIX_LOCK_write);
-
- if (!b->c.lock.readers)
- atomic64_add(__SIX_VAL(read_lock, readers),
- &b->c.lock.state.counter);
- else
- this_cpu_add(*b->c.lock.readers, readers);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = btree_path_node(path, level);
- int want = __btree_lock_want(path, level);
-
- if (!is_btree_node(path, level))
- goto fail;
-
- if (race_fault())
- goto fail;
-
- if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, b, level, want))) {
- mark_btree_node_locked(path, level, want);
- return true;
- }
-fail:
- trace_btree_node_relock_fail(trans->fn, _RET_IP_,
- path->btree_id,
- &path->pos,
- (unsigned long) b,
- path->l[level].lock_seq,
- is_btree_node(path, level) ? b->c.lock.state.seq : 0);
- return false;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = path->l[level].b;
-
- if (!is_btree_node(path, level))
- return false;
-
- switch (btree_lock_want(path, level)) {
- case BTREE_NODE_UNLOCKED:
- BUG_ON(btree_node_locked(path, level));
- return true;
- case BTREE_NODE_READ_LOCKED:
- BUG_ON(btree_node_intent_locked(path, level));
- return bch2_btree_node_relock(trans, path, level);
- case BTREE_NODE_INTENT_LOCKED:
- break;
- }
-
- if (btree_node_intent_locked(path, level))
- return true;
-
- if (race_fault())
- return false;
-
- if (btree_node_locked(path, level)
- ? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
- goto success;
-
- if (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(path, level);
- goto success;
- }
-
- return false;
-success:
- mark_btree_node_intent_locked(path, level);
- return true;
-}
-
-static inline bool btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade)
-{
- unsigned l = path->level;
- int fail_idx = -1;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!(upgrade
- ? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l)))
- fail_idx = l;
-
- l++;
- } while (l < path->locks_want);
-
- /*
- * When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_path_traverse has to walk back up to
- * the node that we failed to relock:
- */
- if (fail_idx >= 0) {
- __bch2_btree_path_unlock(path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
- do {
- path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- --fail_idx;
- } while (fail_idx >= 0);
- }
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
-
- bch2_trans_verify_locks(trans);
-
- return path->uptodate < BTREE_ITER_NEED_RELOCK;
-}
-
-static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
- bool cached)
-{
- return !cached
- ? container_of(_b, struct btree, c)->key.k.p
- : container_of(_b, struct bkey_cached, c)->key.pos;
-}
-
-/* Slowpath: */
-bool __bch2_btree_node_lock(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct bpos pos, unsigned level,
- enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct btree_path *linked, *deadlock_path = NULL;
- u64 start_time = local_clock();
- unsigned reason = 9;
- bool ret;
-
- /* Check if it's safe to block: */
- trans_for_each_path(trans, linked) {
- if (!linked->nodes_locked)
- continue;
-
- /*
- * Can't block taking an intent lock if we have _any_ nodes read
- * locked:
- *
- * - Our read lock blocks another thread with an intent lock on
- * the same node from getting a write lock, and thus from
- * dropping its intent lock
- *
- * - And the other thread may have multiple nodes intent locked:
- * both the node we want to intent lock, and the node we
- * already have read locked - deadlock:
- */
- if (type == SIX_LOCK_intent &&
- linked->nodes_locked != linked->nodes_intent_locked) {
- deadlock_path = linked;
- reason = 1;
- }
-
- if (linked->btree_id != path->btree_id) {
- if (linked->btree_id > path->btree_id) {
- deadlock_path = linked;
- reason = 3;
- }
- continue;
- }
-
- /*
- * Within the same btree, cached paths come before non
- * cached paths:
- */
- if (linked->cached != path->cached) {
- if (path->cached) {
- deadlock_path = linked;
- reason = 4;
- }
- continue;
- }
-
- /*
- * Interior nodes must be locked before their descendants: if
- * another path has possible descendants locked of the node
- * we're about to lock, it must have the ancestors locked too:
- */
- if (level > __fls(linked->nodes_locked)) {
- deadlock_path = linked;
- reason = 5;
- }
-
- /* Must lock btree nodes in key order: */
- if (btree_node_locked(linked, level) &&
- bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- linked->cached)) <= 0) {
- deadlock_path = linked;
- reason = 7;
- BUG_ON(trans->in_traverse_all);
- }
- }
-
- if (unlikely(deadlock_path)) {
- trace_trans_restart_would_deadlock(trans->fn, ip,
- trans->in_traverse_all, reason,
- deadlock_path->btree_id,
- deadlock_path->cached,
- &deadlock_path->pos,
- path->btree_id,
- path->cached,
- &pos);
- btree_trans_restart(trans);
- return false;
- }
-
- if (six_trylock_type(&b->c.lock, type))
- return true;
-
- trans->locking_path_idx = path->idx;
- trans->locking_pos = pos;
- trans->locking_btree_id = path->btree_id;
- trans->locking_level = level;
- trans->locking = b;
-
- ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
-
- trans->locking = NULL;
-
- if (ret)
- bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
- start_time);
- return ret;
-}
-
-/* Btree iterator locking: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_locks(struct btree_path *path)
-{
- unsigned l;
-
- if (!path->nodes_locked) {
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level));
- return;
- }
-
- for (l = 0; btree_path_node(path, l); l++)
- BUG_ON(btree_lock_want(path, l) !=
- btree_node_locked_type(path, l));
-}
-
-void bch2_trans_verify_locks(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- bch2_btree_path_verify_locks(path);
-}
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-#endif
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-bool bch2_btree_path_relock_intent(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned l;
-
- for (l = path->level;
- l < path->locks_want && btree_path_node(path, l);
- l++) {
- if (!bch2_btree_node_relock(trans, path, l)) {
- __bch2_btree_path_unlock(path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
- path->btree_id, &path->pos);
- btree_trans_restart(trans);
- return false;
- }
- }
-
- return true;
-}
-
-__flatten
-static bool bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- bool ret = btree_path_get_locks(trans, path, false);
-
- if (!ret) {
- trace_trans_restart_relock_path(trans->fn, trace_ip,
- path->btree_id, &path->pos);
- btree_trans_restart(trans);
- }
- return ret;
-}
-
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- struct btree_path *linked;
-
- EBUG_ON(path->locks_want >= new_locks_want);
-
- path->locks_want = new_locks_want;
-
- if (btree_path_get_locks(trans, path, true))
- return true;
-
- /*
- * XXX: this is ugly - we'd prefer to not be mucking with other
- * iterators in the btree_trans here.
- *
- * On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_path_traverse()
- * get the locks we want on transaction restart.
- *
- * But if this iterator was a clone, on transaction restart what we did
- * to this iterator isn't going to be preserved.
- *
- * Possibly we could add an iterator field for the parent iterator when
- * an iterator is a copy - for now, we'll just upgrade any other
- * iterators with the same btree id.
- *
- * The code below used to be needed to ensure ancestor nodes get locked
- * before interior nodes - now that's handled by
- * bch2_btree_path_traverse_all().
- */
- trans_for_each_path(trans, linked)
- if (linked != path &&
- linked->cached == path->cached &&
- linked->btree_id == path->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true);
- }
-
- return false;
-}
-
-void __bch2_btree_path_downgrade(struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned l;
-
- EBUG_ON(path->locks_want < new_locks_want);
-
- path->locks_want = new_locks_want;
-
- while (path->nodes_locked &&
- (l = __fls(path->nodes_locked)) >= path->locks_want) {
- if (l > path->level) {
- btree_node_unlock(path, l);
- } else {
- if (btree_node_intent_locked(path, l)) {
- six_lock_downgrade(&path->l[l].b->c.lock);
- path->nodes_intent_locked ^= 1 << l;
- }
- break;
- }
- }
-
- bch2_btree_path_verify_locks(path);
-}
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- bch2_btree_path_downgrade(path);
-}
-
-/* Btree transaction locking: */
-
-bool bch2_trans_relock(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- if (unlikely(trans->restarted))
- return false;
-
- trans_for_each_path(trans, path)
- if (path->should_be_locked &&
- !bch2_btree_path_relock(trans, path, _RET_IP_)) {
- trace_trans_restart_relock(trans->fn, _RET_IP_,
- path->btree_id, &path->pos);
- BUG_ON(!trans->restarted);
- return false;
- }
- return true;
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- __bch2_btree_path_unlock(path);
-}
-
/* Btree iterator: */
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -590,10 +142,10 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
ck = (void *) path->l[0].b;
BUG_ON(ck->key.btree_id != path->btree_id ||
- bkey_cmp(ck->key.pos, path->pos));
+ !bkey_eq(ck->key.pos, path->pos));
if (!locked)
- btree_node_unlock(path, 0);
+ btree_node_unlock(trans, path, 0);
}
static void bch2_btree_path_verify_level(struct btree_trans *trans,
@@ -603,7 +155,9 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_node_iter tmp;
bool locked;
struct bkey_packed *p, *k;
- char buf1[100], buf2[100], buf3[100];
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
const char *msg;
if (!bch2_debug_check_iterators)
@@ -622,7 +176,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
if (!btree_path_node(path, level))
return;
- if (!bch2_btree_node_relock(trans, path, level))
+ if (!bch2_btree_node_relock_notrace(trans, path, level))
return;
BUG_ON(!btree_path_pos_in_node(path, l->b));
@@ -648,29 +202,32 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
}
if (!locked)
- btree_node_unlock(path, level);
+ btree_node_unlock(trans, path, level);
return;
err:
- strcpy(buf2, "(none)");
- strcpy(buf3, "(none)");
-
- bch2_bpos_to_text(&PBUF(buf1), path->pos);
+ bch2_bpos_to_text(&buf1, path->pos);
if (p) {
struct bkey uk = bkey_unpack_key(l->b, p);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
+
+ bch2_bkey_to_text(&buf2, &uk);
+ } else {
+ prt_printf(&buf2, "(none)");
}
if (k) {
struct bkey uk = bkey_unpack_key(l->b, k);
- bch2_bkey_to_text(&PBUF(buf3), &uk);
+
+ bch2_bkey_to_text(&buf3, &uk);
+ } else {
+ prt_printf(&buf3, "(none)");
}
panic("path should be %s key at level %u:\n"
"path pos %s\n"
"prev key %s\n"
"cur key %s\n",
- msg, level, buf1, buf2, buf3);
+ msg, level, buf1.buf, buf2.buf, buf3.buf);
}
static void bch2_btree_path_verify(struct btree_trans *trans,
@@ -730,8 +287,8 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
iter->pos.snapshot != iter->snapshot);
- BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
- bkey_cmp(iter->pos, iter->k.p) > 0);
+ BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+ bkey_gt(iter->pos, iter->k.p));
}
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
@@ -765,19 +322,19 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
if (ret)
goto out;
- if (!bkey_cmp(prev.k->p, k.k->p) &&
+ if (bkey_eq(prev.k->p, k.k->p) &&
bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
prev.k->p.snapshot) > 0) {
- char buf1[100], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- bch2_bkey_to_text(&PBUF(buf1), k.k);
- bch2_bkey_to_text(&PBUF(buf2), prev.k);
+ bch2_bkey_to_text(&buf1, k.k);
+ bch2_bkey_to_text(&buf2, prev.k);
panic("iter snap %u\n"
"k %s\n"
"prev %s\n",
iter->snapshot,
- buf1, buf2);
+ buf1.buf, buf2.buf);
}
out:
bch2_trans_iter_exit(trans, &copy);
@@ -789,7 +346,9 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
{
struct btree_path *path;
unsigned idx;
- char buf[100];
+ struct printbuf buf = PRINTBUF;
+
+ btree_trans_sort_paths(trans);
trans_for_each_path_inorder(trans, path, idx) {
int cmp = cmp_int(path->btree_id, id) ?:
@@ -800,24 +359,25 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
if (cmp < 0)
continue;
- if (!(path->nodes_locked & 1) ||
+ if (!btree_node_locked(path, 0) ||
!path->should_be_locked)
continue;
if (!key_cache) {
- if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
- bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
+ if (bkey_ge(pos, path->l[0].b->data->min_key) &&
+ bkey_le(pos, path->l[0].b->key.k.p))
return;
} else {
- if (!bkey_cmp(pos, path->pos))
+ if (bkey_eq(pos, path->pos))
return;
}
}
bch2_dump_trans_paths_updates(trans);
+ bch2_bpos_to_text(&buf, pos);
+
panic("not locked: %s %s%s\n",
- bch2_btree_ids[id],
- (bch2_bpos_to_text(&PBUF(buf), pos), buf),
+ bch2_btree_ids[id], buf.buf,
key_cache ? " cached" : "");
}
@@ -977,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
unsigned clobber_u64s,
unsigned new_u64s)
{
- struct bset_tree *t = bch2_bkey_to_bset(b, where);
+ struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
struct btree_path *linked;
if (node_iter != &path->l[b->c.level].iter) {
@@ -1003,8 +563,6 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
struct bkey *u,
struct bkey_packed *k)
{
- struct bkey_s_c ret;
-
if (unlikely(!k)) {
/*
* signal to bch2_btree_iter_peek_slot() that we're currently at
@@ -1014,19 +572,7 @@ static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
return bkey_s_c_null;
}
- ret = bkey_disassemble(l->b, k, u);
-
- /*
- * XXX: bch2_btree_bset_insert_key() generates invalid keys when we
- * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key
- * being overwritten but doesn't change k->size. But this is ok, because
- * those keys are never written out, we just have to avoid a spurious
- * assertion here:
- */
- if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
- bch2_bkey_debugcheck(c, l->b, ret);
-
- return ret;
+ return bkey_disassemble(l->b, k, u);
}
static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
@@ -1037,27 +583,31 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(c, l, u,
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->key.k.p;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
return k;
}
-static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c,
+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(c, l, u,
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
bch2_btree_node_iter_prev(&l->iter, l->b));
path->pos = k.k ? k.k->p : l->b->data->min_key;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
return k;
}
@@ -1080,61 +630,6 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
return true;
}
-/*
- * Verify that iterator for parent node points to child node:
- */
-static void btree_path_verify_new_node(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- struct bch_fs *c = trans->c;
- struct btree_path_level *l;
- unsigned plevel;
- bool parent_locked;
- struct bkey_packed *k;
-
- if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- return;
-
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
- return;
-
- plevel = b->c.level + 1;
- if (!btree_path_node(path, plevel))
- return;
-
- parent_locked = btree_node_locked(path, plevel);
-
- if (!bch2_btree_node_relock(trans, path, plevel))
- return;
-
- l = &path->l[plevel];
- k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (!k ||
- bkey_deleted(k) ||
- bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
- char buf1[100];
- char buf2[100];
- char buf3[100];
- char buf4[100];
- struct bkey uk = bkey_unpack_key(b, k);
-
- bch2_dump_btree_node(c, l->b);
- bch2_bpos_to_text(&PBUF(buf1), path->pos);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
- bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
- bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
- panic("parent iter doesn't point to new node:\n"
- "iter pos %s %s\n"
- "iter key %s\n"
- "new node %s-%s\n",
- bch2_btree_ids[path->btree_id], buf1,
- buf2, buf3, buf4);
- }
-
- if (!parent_locked)
- btree_node_unlock(path, plevel);
-}
-
static inline void __btree_path_level_init(struct btree_path *path,
unsigned level)
{
@@ -1150,14 +645,12 @@ static inline void __btree_path_level_init(struct btree_path *path,
bch2_btree_node_iter_peek(&l->iter, l->b);
}
-static inline void btree_path_level_init(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+void bch2_btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
BUG_ON(path->cached);
- btree_path_verify_new_node(trans, path, b);
-
EBUG_ON(!btree_path_pos_in_node(path, b));
EBUG_ON(b->c.lock.state.seq & 1);
@@ -1168,6 +661,32 @@ static inline void btree_path_level_init(struct btree_trans *trans,
/* Btree path: fixups after btree node updates: */
+static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!i->cached &&
+ i->level == b->c.level &&
+ i->btree_id == b->c.btree_id &&
+ bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
+ bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
+ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
+ i->k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+}
+
/*
* A btree node is being replaced - update the iterator to point to the new
* node:
@@ -1177,20 +696,22 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
struct btree_path *path;
trans_for_each_path(trans, path)
- if (!path->cached &&
+ if (path->uptodate == BTREE_ITER_UPTODATE &&
+ !path->cached &&
btree_path_pos_in_node(path, b)) {
enum btree_node_locked_type t =
btree_lock_want(path, b->c.level);
- if (path->nodes_locked &&
- t != BTREE_NODE_UNLOCKED) {
- btree_node_unlock(path, b->c.level);
+ if (t != BTREE_NODE_UNLOCKED) {
+ btree_node_unlock(trans, path, b->c.level);
six_lock_increment(&b->c.lock, t);
- mark_btree_node_locked(path, b->c.level, t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
}
- btree_path_level_init(trans, path, b);
+ bch2_btree_path_level_init(trans, path, b);
}
+
+ bch2_trans_revalidate_updates_in_node(trans, b);
}
/*
@@ -1203,18 +724,12 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
trans_for_each_path_with_node(trans, b, path)
__btree_path_level_init(path, b->c.level);
+
+ bch2_trans_revalidate_updates_in_node(trans, b);
}
/* Btree path: traverse, set_pos: */
-static int lock_root_check_fn(struct six_lock *lock, void *p)
-{
- struct btree *b = container_of(lock, struct btree, c.lock);
- struct btree **rootp = p;
-
- return b == *rootp ? 0 : -1;
-}
-
static inline int btree_path_lock_root(struct btree_trans *trans,
struct btree_path *path,
unsigned depth_want,
@@ -1224,6 +739,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
enum six_lock_type lock_type;
unsigned i;
+ int ret;
EBUG_ON(path->nodes_locked);
@@ -1245,26 +761,27 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
}
lock_type = __btree_lock_want(path, path->level);
- if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
- path->level, lock_type,
- lock_root_check_fn, rootp,
- trace_ip))) {
- if (trans->restarted)
- return -EINTR;
- continue;
+ ret = btree_node_lock(trans, path, &b->c,
+ path->level, lock_type, trace_ip);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ BUG();
}
if (likely(b == READ_ONCE(*rootp) &&
b->c.level == path->level &&
!race_fault())) {
for (i = 0; i < path->level; i++)
- path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
path->l[path->level].b = b;
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
- mark_btree_node_locked(path, path->level, lock_type);
- btree_path_level_init(trans, path, b);
+ mark_btree_node_locked(trans, path, path->level, lock_type);
+ bch2_btree_path_level_init(trans, path, b);
return 0;
}
@@ -1288,7 +805,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
bch2_bkey_buf_init(&tmp);
- while (nr && !ret) {
+ while (nr-- && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
@@ -1298,12 +815,12 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
break;
bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
path->level - 1);
}
if (!was_locked)
- btree_node_unlock(path, path->level);
+ btree_node_unlock(trans, path, path->level);
bch2_bkey_buf_exit(&tmp, c);
return ret;
@@ -1323,7 +840,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
bch2_bkey_buf_init(&tmp);
- while (nr && !ret) {
+ while (nr-- && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
@@ -1333,12 +850,12 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
break;
bch2_bkey_buf_reassemble(&tmp, c, k);
- ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
path->level - 1);
}
if (!was_locked)
- btree_node_unlock(path, path->level);
+ btree_node_unlock(trans, path, path->level);
bch2_bkey_buf_exit(&tmp, c);
return ret;
@@ -1363,7 +880,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
bp->mem_ptr = (unsigned long)b;
if (!locked)
- btree_node_unlock(path, plevel);
+ btree_node_unlock(trans, path, plevel);
}
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
@@ -1400,7 +917,6 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree *b;
unsigned level = path->level - 1;
enum six_lock_type lock_type = __btree_lock_want(path, level);
- bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
struct bkey_buf tmp;
int ret;
@@ -1408,7 +924,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
bch2_bkey_buf_init(&tmp);
- if (unlikely(!replay_done)) {
+ if (unlikely(trans->journal_replay_not_finished)) {
ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
if (ret)
goto err;
@@ -1428,16 +944,17 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (unlikely(ret))
goto err;
- mark_btree_node_locked(path, level, lock_type);
- btree_path_level_init(trans, path, b);
-
- if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
+ if (likely(!trans->journal_replay_not_finished &&
+ tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(trans, path, level + 1, b);
if (btree_node_read_locked(path, level + 1))
- btree_node_unlock(path, level + 1);
+ btree_node_unlock(trans, path, level + 1);
+
+ mark_btree_node_locked(trans, path, level, lock_type);
path->level = level;
+ bch2_btree_path_level_init(trans, path, b);
bch2_btree_path_verify_locks(path);
err:
@@ -1445,43 +962,31 @@ err:
return ret;
}
-static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
- unsigned, unsigned long);
-static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
- unsigned long trace_ip)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
- int i;
+ unsigned long trace_ip = _RET_IP_;
+ int i, ret = 0;
if (trans->in_traverse_all)
- return -EINTR;
+ return -BCH_ERR_transaction_restart_in_traverse_all;
trans->in_traverse_all = true;
retry_all:
- trans->restarted = false;
+ trans->restarted = 0;
+ trans->last_restarted_ip = 0;
trans_for_each_path(trans, path)
path->should_be_locked = false;
- btree_trans_verify_sorted(trans);
-
- for (i = trans->nr_sorted - 2; i >= 0; --i) {
- struct btree_path *path1 = trans->paths + trans->sorted[i];
- struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
-
- if (path1->btree_id == path2->btree_id &&
- path1->locks_want < path2->locks_want)
- __bch2_btree_path_upgrade(trans, path1, path2->locks_want);
- else if (!path1->locks_want && path2->locks_want)
- __bch2_btree_path_upgrade(trans, path1, 1);
- }
+ btree_trans_sort_paths(trans);
bch2_trans_unlock(trans);
cond_resched();
- if (unlikely(ret == -ENOMEM)) {
+ if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
closure_init_stack(&cl);
@@ -1492,92 +997,113 @@ retry_all:
} while (ret);
}
- if (unlikely(ret == -EIO))
- goto out;
-
- BUG_ON(ret && ret != -EINTR);
-
/* Now, redo traversals in correct order: */
i = 0;
while (i < trans->nr_sorted) {
path = trans->paths + trans->sorted[i];
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
-
- ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
- if (ret)
- goto retry_all;
-
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
-
- if (path->nodes_locked ||
- !btree_path_node(path, path->level))
+ /*
+ * Traversing a path can cause another path to be added at about
+ * the same position:
+ */
+ if (path->uptodate) {
+ __btree_path_get(path, false);
+ ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+ __btree_path_put(path, false);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto retry_all;
+ if (ret)
+ goto err;
+ } else {
i++;
+ }
}
/*
- * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
- * and relock(), relock() won't relock since path->should_be_locked
- * isn't set yet, which is all fine
+ * We used to assert that all paths had been traversed here
+ * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
+ * path->should_be_locked is not set yet, we might have unlocked and
+ * then failed to relock a path - that's fine.
*/
- trans_for_each_path(trans, path)
- BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
-out:
+err:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
- trace_trans_traverse_all(trans->fn, trace_ip);
+ trace_and_count(c, trans_traverse_all, trans, trace_ip);
return ret;
}
-static int bch2_btree_path_traverse_all(struct btree_trans *trans)
+static inline bool btree_path_check_pos_in_node(struct btree_path *path,
+ unsigned l, int check_pos)
{
- return __btree_path_traverse_all(trans, 0, _RET_IP_);
+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
+ return false;
+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
+ return false;
+ return true;
}
static inline bool btree_path_good_node(struct btree_trans *trans,
struct btree_path *path,
unsigned l, int check_pos)
{
- if (!is_btree_node(path, l) ||
- !bch2_btree_node_relock(trans, path, l))
- return false;
+ return is_btree_node(path, l) &&
+ bch2_btree_node_relock(trans, path, l) &&
+ btree_path_check_pos_in_node(path, l, check_pos);
+}
- if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
- return false;
- if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
- return false;
- return true;
+static void btree_path_set_level_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_level)
+{
+ unsigned l;
+
+ path->level = new_level;
+
+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, l);
+
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_path_verify(trans, path);
}
-static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
- struct btree_path *path,
- int check_pos)
+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
{
unsigned i, l = path->level;
-
+again:
while (btree_path_node(path, l) &&
- !btree_path_good_node(trans, path, l, check_pos)) {
- btree_node_unlock(path, l);
- path->l[l].b = BTREE_ITER_NO_NODE_UP;
- l++;
- }
+ !btree_path_good_node(trans, path, l, check_pos))
+ __btree_path_set_level_up(trans, path, l++);
/* If we need intent locks, take them too: */
for (i = l + 1;
i < path->locks_want && btree_path_node(path, i);
i++)
- if (!bch2_btree_node_relock(trans, path, i))
- while (l <= i) {
- btree_node_unlock(path, l);
- path->l[l].b = BTREE_ITER_NO_NODE_UP;
- l++;
- }
+ if (!bch2_btree_node_relock(trans, path, i)) {
+ while (l <= i)
+ __btree_path_set_level_up(trans, path, l++);
+ goto again;
+ }
return l;
}
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
+{
+ return likely(btree_node_locked(path, path->level) &&
+ btree_path_check_pos_in_node(path, path->level, check_pos))
+ ? path->level
+ : __btree_path_up_until_good_node(trans, path, check_pos);
+}
+
/*
* This is the main state machine for walking down the btree - walks down to a
* specified depth
@@ -1587,25 +1113,23 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch2_trans_exit().
*/
-static int btree_path_traverse_one(struct btree_trans *trans,
- struct btree_path *path,
- unsigned flags,
- unsigned long trace_ip)
+int bch2_btree_path_traverse_one(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ unsigned long trace_ip)
{
unsigned depth_want = path->level;
- int ret = 0;
+ int ret = -((int) trans->restarted);
- if (unlikely(trans->restarted)) {
- ret = -EINTR;
+ if (unlikely(ret))
goto out;
- }
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
*/
if (path->should_be_locked) {
- ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR;
+ ret = bch2_btree_path_relock(trans, path, trace_ip);
goto out;
}
@@ -1619,6 +1143,9 @@ static int btree_path_traverse_one(struct btree_trans *trans,
path->level = btree_path_up_until_good_node(trans, path, 0);
+ EBUG_ON(btree_path_node(path, path->level) &&
+ !btree_node_locked(path, path->level));
+
/*
* Note: path->nodes[path->level] may be temporarily NULL here - that
* would indicate to other code that we got to the end of the btree,
@@ -1639,52 +1166,38 @@ static int btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
- __bch2_btree_path_unlock(path);
+ __bch2_btree_path_unlock(trans, path);
path->level = depth_want;
-
- if (ret == -EIO)
- path->l[path->level].b =
- BTREE_ITER_NO_NODE_ERROR;
- else
- path->l[path->level].b =
- BTREE_ITER_NO_NODE_DOWN;
+ path->l[path->level].b = ERR_PTR(ret);
goto out;
}
}
path->uptodate = BTREE_ITER_UPTODATE;
out:
- BUG_ON((ret == -EINTR) != !!trans->restarted);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
+ panic("ret %s (%i) trans->restarted %s (%i)\n",
+ bch2_err_str(ret), ret,
+ bch2_err_str(trans->restarted), trans->restarted);
bch2_btree_path_verify(trans, path);
return ret;
}
-static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
-
-int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
- struct btree_path *path, unsigned flags)
-{
- if (path->uptodate < BTREE_ITER_NEED_RELOCK)
- return 0;
-
- return bch2_trans_cond_resched(trans) ?:
- btree_path_traverse_one(trans, path, flags, _RET_IP_);
-}
-
-static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
struct btree_path *src)
{
- unsigned i;
+ unsigned i, offset = offsetof(struct btree_path, pos);
- memcpy(&dst->pos, &src->pos,
- sizeof(struct btree_path) - offsetof(struct btree_path, pos));
+ memcpy((void *) dst + offset,
+ (void *) src + offset,
+ sizeof(struct btree_path) - offset);
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_locked(dst, i))
- six_lock_increment(&dst->l[i].b->c.lock,
- __btree_lock_want(dst, i));
+ for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+ unsigned t = btree_node_locked_type(dst, i);
- btree_path_check_sort(trans, dst, 0);
+ if (t != BTREE_NODE_UNLOCKED)
+ six_lock_increment(&dst->l[i].b->c.lock, t);
+ }
}
static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
@@ -1697,55 +1210,45 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr
return new;
}
-inline struct btree_path * __must_check
-bch2_btree_path_make_mut(struct btree_trans *trans,
+__flatten
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
struct btree_path *path, bool intent,
unsigned long ip)
{
- if (path->ref > 1 || path->preserve) {
- __btree_path_put(path, intent);
- path = btree_path_clone(trans, path, intent);
- path->preserve = false;
-#ifdef CONFIG_BCACHEFS_DEBUG
- path->ip_allocated = ip;
-#endif
- btree_trans_verify_sorted(trans);
- }
-
+ __btree_path_put(path, intent);
+ path = btree_path_clone(trans, path, intent);
+ path->preserve = false;
return path;
}
-static struct btree_path * __must_check
-btree_path_set_pos(struct btree_trans *trans,
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *trans,
struct btree_path *path, struct bpos new_pos,
- bool intent, unsigned long ip)
+ bool intent, unsigned long ip, int cmp)
{
- int cmp = bpos_cmp(new_pos, path->pos);
- unsigned l = path->level;
+ unsigned level = path->level;
- EBUG_ON(trans->restarted);
+ bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!path->ref);
- if (!cmp)
- return path;
-
path = bch2_btree_path_make_mut(trans, path, intent, ip);
path->pos = new_pos;
- path->should_be_locked = false;
-
- btree_path_check_sort(trans, path, cmp);
+ trans->paths_sorted = false;
if (unlikely(path->cached)) {
- btree_node_unlock(path, 0);
- path->l[0].b = BTREE_ITER_NO_NODE_CACHED;
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
goto out;
}
- l = btree_path_up_until_good_node(trans, path, cmp);
+ level = btree_path_up_until_good_node(trans, path, cmp);
- if (btree_path_node(path, l)) {
+ if (btree_path_node(path, level)) {
+ struct btree_path_level *l = &path->l[level];
+
+ BUG_ON(!btree_node_locked(path, level));
/*
* We might have to skip over many keys, or just a few: try
* advancing the node iterator, and if we have to skip over too
@@ -1753,13 +1256,20 @@ btree_path_set_pos(struct btree_trans *trans,
* is expensive).
*/
if (cmp < 0 ||
- !btree_path_advance_to_pos(path, &path->l[l], 8))
- __btree_path_level_init(path, l);
+ !btree_path_advance_to_pos(path, l, 8))
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first non
+ * whiteout:
+ */
+ if (unlikely(level))
+ bch2_btree_node_iter_peek(&l->iter, l->b);
}
- if (l != path->level) {
+ if (unlikely(level != path->level)) {
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- __bch2_btree_path_unlock(path);
+ __bch2_btree_path_unlock(trans, path);
}
out:
bch2_btree_path_verify(trans, path);
@@ -1770,37 +1280,37 @@ out:
static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
{
- struct btree_path *next;
+ struct btree_path *sib;
- next = prev_btree_path(trans, path);
- if (next && !btree_path_cmp(next, path))
- return next;
+ sib = prev_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
- next = next_btree_path(trans, path);
- if (next && !btree_path_cmp(next, path))
- return next;
+ sib = next_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
return NULL;
}
static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
{
- struct btree_path *next;
+ struct btree_path *sib;
- next = prev_btree_path(trans, path);
- if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
- return next;
+ sib = prev_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
- next = next_btree_path(trans, path);
- if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
- return next;
+ sib = next_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
return NULL;
}
static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
{
- __bch2_btree_path_unlock(path);
+ __bch2_btree_path_unlock(trans, path);
btree_path_list_remove(trans, path);
trans->paths_allocated &= ~(1ULL << path->idx);
}
@@ -1815,90 +1325,222 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
if (!__btree_path_put(path, intent))
return;
- /*
- * Perhaps instead we should check for duplicate paths in traverse_all:
- */
- if (path->preserve &&
- (dup = have_path_at_pos(trans, path))) {
- dup->preserve = true;
- path->preserve = false;
- goto free;
- }
+ dup = path->preserve
+ ? have_path_at_pos(trans, path)
+ : have_node_at_pos(trans, path);
+
+ if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ return;
- if (!path->preserve &&
- (dup = have_node_at_pos(trans, path)))
- goto free;
- return;
-free:
if (path->should_be_locked &&
- !btree_node_locked(dup, path->level))
+ !trans->restarted &&
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ return;
+
+ if (dup) {
+ dup->preserve |= path->preserve;
+ dup->should_be_locked |= path->should_be_locked;
+ }
+
+ __bch2_path_free(trans, path);
+}
+
+static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+ bool intent)
+{
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
return;
- dup->should_be_locked |= path->should_be_locked;
__bch2_path_free(trans, path);
}
+void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+{
+ panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+ trans->restart_count, restart_count,
+ (void *) trans->last_begin_ip);
+}
+
+void bch2_trans_in_restart_error(struct btree_trans *trans)
+{
+ panic("in transaction restart: %s, last restarted by %pS\n",
+ bch2_err_str(trans->restarted),
+ (void *) trans->last_restarted_ip);
+}
+
noinline __cold
-void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- struct btree_path *path;
struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
+
+ prt_printf(buf, "transaction updates for %s journal seq %llu",
+ trans->fn, trans->journal_res.seq);
+ prt_newline(buf);
+ printbuf_indent_add(buf, 2);
+
+ trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
+ prt_printf(buf, "update: btree=%s cached=%u %pS",
+ bch2_btree_ids[i->btree_id],
+ i->cached,
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " old ");
+ bch2_bkey_val_to_text(buf, trans->c, old);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+ prt_newline(buf);
+ }
+
+ trans_for_each_wb_update(trans, wb) {
+ prt_printf(buf, "update: btree=%s wb=1 %pS",
+ bch2_btree_ids[wb->btree],
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
+ prt_newline(buf);
+ }
+
+ printbuf_indent_sub(buf, 2);
+}
+
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_updates_to_text(&buf, trans);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline __cold
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+{
+ prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+ path->idx, path->ref, path->intent_ref,
+ path->preserve ? 'P' : ' ',
+ path->should_be_locked ? 'S' : ' ',
+ bch2_btree_ids[path->btree_id],
+ path->level);
+ bch2_bpos_to_text(out, path->pos);
+
+ prt_printf(out, " locks %u", path->nodes_locked);
+#ifdef TRACK_PATH_ALLOCATED
+ prt_printf(out, " %pS", (void *) path->ip_allocated);
+#endif
+ prt_newline(out);
+}
+
+noinline __cold
+void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
+ bool nosort)
+{
+ struct btree_path *path;
unsigned idx;
- char buf1[300], buf2[300];
- btree_trans_verify_sorted(trans);
+ if (!nosort)
+ btree_trans_sort_paths(trans);
trans_for_each_path_inorder(trans, path, idx)
- printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n",
- path->idx, path->ref, path->intent_ref,
- path->should_be_locked ? " S" : "",
- path->preserve ? " P" : "",
- bch2_btree_ids[path->btree_id],
- (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1),
- path->nodes_locked,
-#ifdef CONFIG_BCACHEFS_DEBUG
- (void *) path->ip_allocated
-#else
- NULL
-#endif
- );
+ bch2_btree_path_to_text(out, path);
+}
- trans_for_each_update(trans, i) {
- struct bkey u;
- struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
+noinline __cold
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ __bch2_trans_paths_to_text(out, trans, false);
+}
- printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s",
- bch2_btree_ids[i->btree_id],
- (void *) i->ip_allocated,
- (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1),
- (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2));
+noinline __cold
+void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
+{
+ struct printbuf buf = PRINTBUF;
+
+ __bch2_trans_paths_to_text(&buf, trans, nosort);
+ bch2_trans_updates_to_text(&buf, trans);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+ __bch2_dump_trans_paths_updates(trans, false);
+}
+
+noinline __cold
+static void bch2_trans_update_max_paths(struct btree_trans *trans)
+{
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ struct printbuf buf = PRINTBUF;
+
+ if (!s)
+ return;
+
+ bch2_trans_paths_to_text(&buf, trans);
+
+ if (!buf.allocation_failure) {
+ mutex_lock(&s->lock);
+ if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
+ s->nr_max_paths = trans->nr_max_paths =
+ hweight64(trans->paths_allocated);
+ swap(s->max_paths_text, buf.buf);
+ }
+ mutex_unlock(&s->lock);
}
+
+ printbuf_exit(&buf);
+
+ trans->nr_max_paths = hweight64(trans->paths_allocated);
+}
+
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans path oveflow\n");
}
-static struct btree_path *btree_path_alloc(struct btree_trans *trans,
- struct btree_path *pos)
+static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
+ struct btree_path *pos)
{
struct btree_path *path;
unsigned idx;
if (unlikely(trans->paths_allocated ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
- bch2_dump_trans_paths_updates(trans);
- panic("trans path oveflow\n");
- }
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+ btree_path_overflow(trans);
idx = __ffs64(~trans->paths_allocated);
+
+ /*
+ * Do this before marking the new path as allocated, since it won't be
+ * initialized yet:
+ */
+ if (unlikely(idx > trans->nr_max_paths))
+ bch2_trans_update_max_paths(trans);
+
trans->paths_allocated |= 1ULL << idx;
path = &trans->paths[idx];
-
path->idx = idx;
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
- path->nodes_intent_locked = 0;
btree_path_list_add(trans, pos, path);
+ trans->paths_sorted = false;
return path;
}
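A minimal standalone sketch of the bitmap-slot idiom btree_path_alloc() relies on, for reference only (not part of the patch; __builtin_ctzll stands in for the kernel's __ffs64):

#include <stdint.h>

/*
 * Illustrative sketch: each of up to 64 slots is one bit in a u64 occupancy
 * bitmap.  A free slot is the first clear bit, i.e. the first set bit of the
 * complement; -1 means every slot is taken - the condition the code above
 * treats as path overflow.
 */
static int bitmap_slot_alloc(uint64_t *allocated, unsigned nr_slots)
{
	uint64_t full = nr_slots == 64 ? ~0ULL : (1ULL << nr_slots) - 1;
	unsigned idx;

	if (*allocated == full)
		return -1;				/* all slots in use */

	idx = __builtin_ctzll(~*allocated);		/* first clear bit */
	*allocated |= 1ULL << idx;
	return idx;
}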
@@ -1912,7 +1554,10 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
bool intent = flags & BTREE_ITER_INTENT;
int i;
- BUG_ON(trans->restarted);
+ bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_locks(trans);
+
+ btree_trans_sort_paths(trans);
trans_for_each_path_inorder(trans, path, i) {
if (__btree_path_cmp(path,
@@ -1930,7 +1575,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path_pos->btree_id == btree_id &&
path_pos->level == level) {
__btree_path_get(path_pos, intent);
- path = btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
} else {
path = btree_path_alloc(trans, path_pos);
path_pos = NULL;
@@ -1944,13 +1589,12 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path->level = level;
path->locks_want = locks_want;
path->nodes_locked = 0;
- path->nodes_intent_locked = 0;
for (i = 0; i < ARRAY_SIZE(path->l); i++)
- path->l[i].b = BTREE_ITER_NO_NODE_INIT;
-#ifdef CONFIG_BCACHEFS_DEBUG
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+#ifdef TRACK_PATH_ALLOCATED
path->ip_allocated = ip;
#endif
- btree_trans_verify_sorted(trans);
+ trans->paths_sorted = false;
}
if (!(flags & BTREE_ITER_NOPRESERVE))
@@ -1968,42 +1612,43 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
*/
locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > path->locks_want) {
- path->locks_want = locks_want;
- btree_path_get_locks(trans, path, true);
- }
+ if (locks_want > path->locks_want)
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
return path;
}
-inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
+ struct btree_path_level *l = path_l(path);
+ struct bkey_packed *_k;
struct bkey_s_c k;
- BUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ if (unlikely(!l->b))
+ return bkey_s_c_null;
- if (!path->cached) {
- struct btree_path_level *l = path_l(path);
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ EBUG_ON(!btree_node_locked(path, path->level));
+ if (!path->cached) {
+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
- EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
+ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
- if (!k.k || bpos_cmp(path->pos, k.k->p))
+ if (!k.k || !bpos_eq(path->pos, k.k->p))
goto hole;
} else {
struct bkey_cached *ck = (void *) path->l[0].b;
- EBUG_ON(path->btree_id != ck->key.btree_id ||
- bkey_cmp(path->pos, ck->key.pos));
-
- /* BTREE_ITER_CACHED_NOFILL? */
- if (unlikely(!ck->valid))
- goto hole;
+ EBUG_ON(ck &&
+ (path->btree_id != ck->key.btree_id ||
+ !bkey_eq(path->pos, ck->key.pos)));
+ if (!ck || !ck->valid)
+ return bkey_s_c_null;
+ *u = ck->k->k;
k = bkey_i_to_s_c(ck->k);
}
@@ -2027,7 +1672,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
- iter->path = btree_path_set_pos(iter->trans, iter->path,
+ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
btree_iter_search_key(iter),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2036,7 +1681,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
- iter->path->should_be_locked = true;
+ btree_path_set_should_be_locked(iter->path);
return 0;
}
@@ -2059,16 +1704,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
if (!b)
goto out;
- BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
+ BUG_ON(bpos_lt(b->key.k.p, iter->pos));
bkey_init(&iter->k);
iter->k.p = iter->pos = b->key.k.p;
- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- iter->path->should_be_locked = true;
- BUG_ON(iter->path->uptodate);
+ btree_path_set_should_be_locked(iter->path);
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2079,15 +1723,25 @@ err:
goto out;
}
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+{
+ struct btree *b;
+
+ while (b = bch2_btree_iter_peek_node(iter),
+ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
+ bch2_trans_begin(iter->trans);
+
+ return b;
+}
+
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct btree_path *path = iter->path;
struct btree *b = NULL;
- unsigned l;
int ret;
- BUG_ON(trans->restarted);
+ bch2_trans_verify_not_in_restart(trans);
EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
@@ -2097,47 +1751,35 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
/* got to end? */
if (!btree_path_node(path, path->level + 1)) {
- btree_node_unlock(path, path->level);
- path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
- path->level++;
+ btree_path_set_level_up(trans, path);
return NULL;
}
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
- __bch2_btree_path_unlock(path);
- path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
- path->btree_id, &path->pos);
- btree_trans_restart(trans);
- ret = -EINTR;
+ __bch2_btree_path_unlock(trans, path);
+ path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
goto err;
}
b = btree_path_node(path, path->level + 1);
- if (!bpos_cmp(iter->pos, b->key.k.p)) {
- btree_node_unlock(path, path->level);
- path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
- path->level++;
+ if (bpos_eq(iter->pos, b->key.k.p)) {
+ __btree_path_set_level_up(trans, path, path->level++);
} else {
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
path = iter->path =
- btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- path->level = iter->min_depth;
-
- for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
- if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(path, l);
-
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- bch2_btree_iter_verify(iter);
+ btree_path_set_level_down(trans, path, iter->min_depth);
ret = bch2_btree_path_traverse(trans, path, iter->flags);
if (ret)
@@ -2149,10 +1791,10 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
bkey_init(&iter->k);
iter->k.p = iter->pos = b->key.k.p;
- iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- iter->path->should_be_locked = true;
+ btree_path_set_should_be_locked(iter->path);
BUG_ON(iter->path->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
@@ -2168,23 +1810,31 @@ err:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- struct bpos pos = iter->k.p;
- bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_cmp(pos, SPOS_MAX)
- : bkey_cmp(pos, SPOS_MAX)) != 0;
+ if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+ } else {
+ if (!btree_path_node(iter->path, iter->path->level))
+ return true;
+
+ iter->advanced = true;
+ return false;
+ }
}
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
- bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_cmp(pos, POS_MIN)
- : bkey_cmp(pos, POS_MIN)) != 0;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, POS_MIN)
+ : bkey_eq(pos, POS_MIN));
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
pos = bkey_predecessor(iter, pos);
@@ -2192,39 +1842,66 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
return ret;
}
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos)
+static noinline
+struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
{
struct btree_insert_entry *i;
+ struct bkey_i *ret = NULL;
- trans_for_each_update(trans, i)
- if ((cmp_int(btree_id, i->btree_id) ?:
- bpos_cmp(pos, i->k->k.p)) <= 0) {
- if (btree_id == i->btree_id)
- return i->k;
+ trans_for_each_update(iter->trans, i) {
+ if (i->btree_id < iter->btree_id)
+ continue;
+ if (i->btree_id > iter->btree_id)
break;
- }
+ if (bpos_lt(i->k->k.p, iter->path->pos))
+ continue;
+ if (i->key_cache_already_flushed)
+ continue;
+ if (!ret || bpos_lt(i->k->k.p, ret->k.p))
+ ret = i->k;
+ }
- return NULL;
+ return ret;
}
-static noinline
-struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_path *path)
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
{
- struct journal_keys *keys = &trans->c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, path->btree_id,
- path->level, path->pos);
+ return iter->flags & BTREE_ITER_WITH_UPDATES
+ ? __bch2_btree_trans_peek_updates(iter)
+ : NULL;
+}
- while (idx < keys->nr && keys->d[idx].overwritten)
- idx++;
+struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end_pos)
+{
+ struct bkey_i *k;
- return (idx < keys->nr &&
- keys->d[idx].btree_id == path->btree_id &&
- keys->d[idx].level == path->level)
- ? keys->d[idx].k
- : NULL;
+ if (bpos_lt(iter->path->pos, iter->journal_pos))
+ iter->journal_idx = 0;
+
+ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ iter->path->level,
+ iter->path->pos,
+ end_pos,
+ &iter->journal_idx);
+
+ iter->journal_pos = k ? k->k.p : end_pos;
+ return k;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+
+ if (k) {
+ iter->k = k->k;
+ return bkey_i_to_s_c(k);
+ } else {
+ return bkey_s_c_null;
+ }
}
static noinline
@@ -2233,11 +1910,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bkey_i *next_journal =
- __btree_trans_peek_journal(trans, iter->path);
+ bch2_btree_journal_peek(trans, iter,
+ k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
- if (next_journal &&
- bpos_cmp(next_journal->k.p,
- k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ if (next_journal) {
iter->k = next_journal->k;
k = bkey_i_to_s_c(next_journal);
}
@@ -2245,18 +1921,67 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
return k;
}
+/*
+ * Checks btree key cache for key at iter->pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k;
+ int ret;
+
+ if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+ bpos_eq(iter->pos, pos))
+ return bkey_s_c_null;
+
+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+ return bkey_s_c_null;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+ iter->flags & BTREE_ITER_INTENT, 0,
+ iter->flags|BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL,
+ _THIS_IP_);
+
+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ iter->flags|BTREE_ITER_CACHED) ?:
+ bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ btree_path_set_should_be_locked(iter->key_cache_path);
+
+ k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+ if (k.k && !bkey_err(k)) {
+ iter->k = u;
+ k.k = &iter->k;
+ }
+ return k;
+}
+
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
struct bkey_i *next_update;
- struct bkey_s_c k;
+ struct bkey_s_c k, k2;
int ret;
- EBUG_ON(iter->path->cached || iter->path->level);
+ EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
while (1) {
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ struct btree_path_level *l;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2268,17 +1993,38 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+ l = path_l(iter->path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
+
+ btree_path_set_should_be_locked(iter->path);
+
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ k = k2;
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ goto out;
+ }
+ }
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
k = btree_trans_peek_journal(trans, iter, k);
- next_update = iter->flags & BTREE_ITER_WITH_UPDATES
- ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
- : NULL;
+ next_update = btree_trans_peek_updates(iter);
+
if (next_update &&
- bpos_cmp(next_update->k.p,
- k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ bpos_le(next_update->k.p,
+ k.k ? k.k->p : l->b->key.k.p)) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
}
@@ -2291,7 +2037,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
* whiteout, with a real key at the same position, since
* in the btree deleted keys sort before non deleted.
*/
- search_key = bpos_cmp(search_key, k.k->p)
+ search_key = !bpos_eq(search_key, k.k->p)
? k.k->p
: bpos_successor(k.k->p);
continue;
@@ -2299,9 +2045,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (likely(k.k)) {
break;
- } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
+ } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
/* Advance to next leaf node: */
- search_key = bpos_successor(iter->path->l[0].b->key.k.p);
+ search_key = bpos_successor(l->b->key.k.p);
} else {
/* End of btree: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
@@ -2319,16 +2065,20 @@ out:
* bch2_btree_iter_peek: returns first key greater than or equal to iterator's
* current position
*/
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
+ struct bpos iter_pos;
int ret;
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+ EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
+
if (iter->update_path) {
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
iter->update_path = NULL;
}
@@ -2336,13 +2086,30 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
while (1) {
k = __bch2_btree_iter_peek(iter, search_key);
- if (!k.k || bkey_err(k))
- goto out;
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+
+ /*
+ * iter->pos should be monotonically increasing, and always be
+ * equal to the key we just returned - except extents can
+ * straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter_pos = k.k->p;
+ else
+ iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
+
+ if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_gt(iter_pos, end)
+ : bkey_ge(iter_pos, end)))
+ goto end;
if (iter->update_path &&
- bkey_cmp(iter->update_path->pos, k.k->p)) {
- bch2_path_put(trans, iter->update_path,
- iter->flags & BTREE_ITER_INTENT);
+ !bkey_eq(iter->update_path->pos, k.k->p)) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
iter->update_path = NULL;
}
@@ -2366,13 +2133,15 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
__btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
iter->update_path = iter->path;
- iter->update_path = btree_path_set_pos(trans,
+ iter->update_path = bch2_btree_path_set_pos(trans,
iter->update_path, pos,
iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
-
- BUG_ON(!(iter->update_path->nodes_locked & 1));
- iter->update_path->should_be_locked = true;
+ _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
}
/*
@@ -2396,25 +2165,21 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
break;
}
- /*
- * iter->pos should be mononotically increasing, and always be equal to
- * the key we just returned - except extents can straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
- iter->pos = k.k->p;
- else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- iter->pos = bkey_start_pos(k.k);
+ iter->pos = iter_pos;
- iter->path = btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- BUG_ON(!iter->path->nodes_locked);
-out:
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
if (iter->update_path) {
- BUG_ON(!(iter->update_path->nodes_locked & 1));
- iter->update_path->should_be_locked = true;
+ ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+ if (unlikely(ret))
+ k = bkey_s_c_err(ret);
+ else
+ btree_path_set_should_be_locked(iter->update_path);
}
- iter->path->should_be_locked = true;
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
iter->pos.snapshot = iter->snapshot;
@@ -2428,6 +2193,104 @@ out:
bch2_btree_iter_verify_entry_exit(iter);
return k;
+end:
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
+}
+
+/**
+ * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
+ * to iterator's current position, returning keys from every level of the btree.
+ * For keys at different levels of the btree that compare equal, the key from
+ * the lower level (leaf) is returned first.
+ */
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_s_c k;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+ BUG_ON(iter->path->level < iter->min_depth);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+ EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ /* Already at end? */
+ if (!btree_path_node(iter->path, iter->path->level)) {
+ k = bkey_s_c_null;
+ goto out_no_locked;
+ }
+
+ k = btree_path_level_peek_all(trans->c,
+ &iter->path->l[iter->path->level], &iter->k);
+
+ /* Check if we should go up to the parent node: */
+ if (!k.k ||
+ (iter->advanced &&
+ bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
+ iter->pos = path_l(iter->path)->b->key.k.p;
+ btree_path_set_level_up(trans, iter->path);
+ iter->advanced = false;
+ continue;
+ }
+
+ /*
+ * Check if we should go back down to a leaf:
+ * If we're not in a leaf node, we only return the current key
+ * if it exactly matches iter->pos - otherwise we first have to
+ * go back to the leaf:
+ */
+ if (iter->path->level != iter->min_depth &&
+ (iter->advanced ||
+ !k.k ||
+ !bpos_eq(iter->pos, k.k->p))) {
+ btree_path_set_level_down(trans, iter->path, iter->min_depth);
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ /* Check if we should go to the next key: */
+ if (iter->path->level == iter->min_depth &&
+ iter->advanced &&
+ k.k &&
+ bpos_eq(iter->pos, k.k->p)) {
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ if (iter->advanced &&
+ iter->path->level == iter->min_depth &&
+ !bpos_eq(k.k->p, iter->pos))
+ iter->advanced = false;
+
+ BUG_ON(iter->advanced);
+ BUG_ON(!k.k);
+ break;
+ }
+
+ iter->pos = k.k->p;
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ bch2_btree_iter_verify(iter);
+
+ return k;
}
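The comment above states what bch2_btree_iter_peek_all_levels() returns but not how it is typically driven; below is a rough usage sketch, assuming the usual bch2_trans_iter_init()/bch2_trans_iter_exit() pairing and the BTREE_ITER_ALL_LEVELS|BTREE_ITER_ALL_SNAPSHOTS flags the asserts above demand. walk_all_levels() and its processing hook are hypothetical, not taken from this patch:

static int walk_all_levels(struct btree_trans *trans, enum btree_id btree)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, btree, POS_MIN,
			     BTREE_ITER_ALL_LEVELS|
			     BTREE_ITER_ALL_SNAPSHOTS);

	while (1) {
		k = bch2_btree_iter_peek_all_levels(&iter);
		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			bch2_trans_begin(trans);
			continue;
		}
		if (ret || !k.k)
			break;

		/* ... examine k: for equal positions, leaf keys arrive first ... */

		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}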
/**
@@ -2469,7 +2332,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
search_key.snapshot = U32_MAX;
while (1) {
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -2478,20 +2341,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out;
+ goto out_no_locked;
}
- k = btree_path_level_peek(trans->c, iter->path,
+ k = btree_path_level_peek(trans, iter->path,
&iter->path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
- : bpos_cmp(k.k->p, search_key) > 0))
- k = btree_path_level_prev(trans->c, iter->path,
+ ? bpos_ge(bkey_start_pos(k.k), search_key)
+ : bpos_gt(k.k->p, search_key)))
+ k = btree_path_level_prev(trans, iter->path,
&iter->path->l[0], &iter->k);
- btree_path_check_sort(trans, iter->path, 0);
-
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
if (k.k->p.snapshot == iter->snapshot)
@@ -2502,8 +2363,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
* longer at the same _key_ (not pos), return
* that candidate
*/
- if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
- bch2_path_put(trans, iter->path,
+ if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
+ bch2_path_put_nokeep(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
iter->path = saved_path;
saved_path = NULL;
@@ -2516,7 +2377,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
iter->snapshot,
k.k->p.snapshot)) {
if (saved_path)
- bch2_path_put(trans, saved_path,
+ bch2_path_put_nokeep(trans, saved_path,
iter->flags & BTREE_ITER_INTENT);
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
@@ -2537,29 +2398,30 @@ got_key:
}
break;
- } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
} else {
/* Start of btree: */
bch2_btree_iter_set_pos(iter, POS_MIN);
k = bkey_s_c_null;
- goto out;
+ goto out_no_locked;
}
}
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
+ EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
/* Extents can straddle iter->pos: */
- if (bkey_cmp(k.k->p, iter->pos) < 0)
+ if (bkey_lt(k.k->p, iter->pos))
iter->pos = k.k->p;
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
-out:
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
if (saved_path)
- bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
- iter->path->should_be_locked = true;
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2586,9 +2448,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c k;
int ret;
- EBUG_ON(iter->path->level);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+ EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
@@ -2600,44 +2463,56 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
search_key = btree_iter_search_key(iter);
- iter->path = btree_path_set_pos(trans, iter->path, search_key,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
if ((iter->flags & BTREE_ITER_CACHED) ||
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update;
- if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
- (next_update = btree_trans_peek_updates(trans,
- iter->btree_id, search_key)) &&
- !bpos_cmp(next_update->k.p, iter->pos)) {
+ if ((next_update = btree_trans_peek_updates(iter)) &&
+ bpos_eq(next_update->k.p, iter->pos)) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
goto out;
}
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
- (next_update = __btree_trans_peek_journal(trans, iter->path)) &&
- !bpos_cmp(next_update->k.p, iter->pos)) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
+ (k = btree_trans_peek_slot_journal(trans, iter)).k)
goto out;
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ if (!bkey_err(k))
+ iter->k = *k.k;
+ /* We're not returning a key from iter->path: */
+ goto out_no_locked;
}
k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ if (unlikely(!k.k))
+ goto out_no_locked;
} else {
struct bpos next;
+ struct bpos end = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ end.offset = U64_MAX;
+
+ EBUG_ON(iter->path->level);
if (iter->flags & BTREE_ITER_INTENT) {
struct btree_iter iter2;
bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek(&iter2);
+ k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
iter->k = iter2.k;
@@ -2647,16 +2522,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
} else {
struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek(iter);
- iter->pos = pos;
+ k = bch2_btree_iter_peek_upto(iter, end);
+ if (unlikely(bkey_err(k)))
+ bch2_btree_iter_set_pos(iter, pos);
+ else
+ iter->pos = pos;
}
if (unlikely(bkey_err(k)))
- return k;
+ goto out_no_locked;
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
- if (bkey_cmp(iter->pos, next) < 0) {
+ if (bkey_lt(iter->pos, next)) {
bkey_init(&iter->k);
iter->k.p = iter->pos;
@@ -2674,8 +2552,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- iter->path->should_be_locked = true;
-
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
ret = bch2_btree_iter_verify_ret(iter, k);
@@ -2701,76 +2579,99 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
-/* new transactional stuff: */
-
-static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
- struct btree_path *path)
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
{
- EBUG_ON(path->sorted_idx >= trans->nr_sorted);
- EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(iter->trans) ||
+ (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(iter->trans);
+
+ return k;
}
-static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
-{
+/* new transactional stuff: */
+
#ifdef CONFIG_BCACHEFS_DEBUG
+static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
+{
+ struct btree_path *path;
unsigned i;
- for (i = 0; i < trans->nr_sorted; i++)
- btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
-#endif
+ BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+
+ trans_for_each_path(trans, path) {
+ BUG_ON(path->sorted_idx >= trans->nr_sorted);
+ BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ }
+
+ for (i = 0; i < trans->nr_sorted; i++) {
+ unsigned idx = trans->sorted[i];
+
+ EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+ BUG_ON(trans->paths[idx].sorted_idx != i);
+ }
}
static void btree_trans_verify_sorted(struct btree_trans *trans)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
struct btree_path *path, *prev = NULL;
unsigned i;
+ if (!bch2_debug_check_iterators)
+ return;
+
trans_for_each_path_inorder(trans, path, i) {
- BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+ if (prev && btree_path_cmp(prev, path) > 0) {
+ __bch2_dump_trans_paths_updates(trans, true);
+ panic("trans paths out of order!\n");
+ }
prev = path;
}
-#endif
}
+#else
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
+static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
+#endif
-static inline void btree_path_swap(struct btree_trans *trans,
- struct btree_path *l, struct btree_path *r)
+void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
{
- swap(l->sorted_idx, r->sorted_idx);
- swap(trans->sorted[l->sorted_idx],
- trans->sorted[r->sorted_idx]);
-
- btree_path_verify_sorted_ref(trans, l);
- btree_path_verify_sorted_ref(trans, r);
-}
+ int i, l = 0, r = trans->nr_sorted, inc = 1;
+ bool swapped;
-static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
- int cmp)
-{
- struct btree_path *n;
+ btree_trans_verify_sorted_refs(trans);
- if (cmp <= 0) {
- n = prev_btree_path(trans, path);
- if (n && btree_path_cmp(n, path) > 0) {
- do {
- btree_path_swap(trans, n, path);
- n = prev_btree_path(trans, path);
- } while (n && btree_path_cmp(n, path) > 0);
+ if (trans->paths_sorted)
+ goto out;
- goto out;
+ /*
+ * Cocktail shaker sort: this is efficient because iterators will be
+ * mostly sorted.
+ */
+ do {
+ swapped = false;
+
+ for (i = inc > 0 ? l : r - 2;
+ i + 1 < r && i >= l;
+ i += inc) {
+ if (btree_path_cmp(trans->paths + trans->sorted[i],
+ trans->paths + trans->sorted[i + 1]) > 0) {
+ swap(trans->sorted[i], trans->sorted[i + 1]);
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+ trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
+ swapped = true;
+ }
}
- }
- if (cmp >= 0) {
- n = next_btree_path(trans, path);
- if (n && btree_path_cmp(path, n) > 0) {
- do {
- btree_path_swap(trans, path, n);
- n = next_btree_path(trans, path);
- } while (n && btree_path_cmp(path, n) > 0);
- }
- }
+ if (inc > 0)
+ --r;
+ else
+ l++;
+ inc = -inc;
+ } while (swapped);
+
+ trans->paths_sorted = true;
out:
btree_trans_verify_sorted(trans);
}
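For reference, the same cocktail shaker pass structure on a plain int array, outside any btree_trans context (illustrative sketch only): alternate forward and backward passes, shrinking the unsorted window from whichever end the previous pass finished at, so nearly sorted input needs only a pass or two.

#include <stdbool.h>

/* Illustrative sketch, not part of the patch. */
static void shaker_sort(int *a, int n)
{
	int i, l = 0, r = n, inc = 1;
	bool swapped;

	do {
		swapped = false;

		for (i = inc > 0 ? l : r - 2;
		     i + 1 < r && i >= l;
		     i += inc)
			if (a[i] > a[i + 1]) {
				int tmp = a[i];

				a[i] = a[i + 1];
				a[i + 1] = tmp;
				swapped = true;
			}

		if (inc > 0)
			--r;		/* largest element settled at the top */
		else
			l++;		/* smallest element settled at the bottom */
		inc = -inc;
	} while (swapped);
}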
@@ -2781,15 +2682,18 @@ static inline void btree_path_list_remove(struct btree_trans *trans,
unsigned i;
EBUG_ON(path->sorted_idx >= trans->nr_sorted);
-
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ trans->nr_sorted--;
+ memmove_u64s_down_small(trans->sorted + path->sorted_idx,
+ trans->sorted + path->sorted_idx + 1,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+#else
array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
-
+#endif
for (i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
path->sorted_idx = U8_MAX;
-
- btree_trans_verify_sorted_refs(trans);
}
static inline void btree_path_list_add(struct btree_trans *trans,
@@ -2798,11 +2702,17 @@ static inline void btree_path_list_add(struct btree_trans *trans,
{
unsigned i;
- btree_trans_verify_sorted_refs(trans);
-
- path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
+ path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
+ trans->sorted + path->sorted_idx,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ trans->nr_sorted++;
+ trans->sorted[path->sorted_idx] = path->idx;
+#else
array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+#endif
for (i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
@@ -2812,67 +2722,38 @@ static inline void btree_path_list_add(struct btree_trans *trans,
void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
+ if (iter->update_path)
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
if (iter->path)
bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
- if (iter->update_path)
- bch2_path_put(trans, iter->update_path,
+ if (iter->key_cache_path)
+ bch2_path_put(trans, iter->key_cache_path,
iter->flags & BTREE_ITER_INTENT);
iter->path = NULL;
iter->update_path = NULL;
+ iter->key_cache_path = NULL;
}
-static void __bch2_trans_iter_init(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags,
- unsigned long ip)
+static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
{
- EBUG_ON(trans->restarted);
-
- if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
- btree_node_type_is_extents(btree_id))
- flags |= BTREE_ITER_IS_EXTENTS;
-
- if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(btree_id))
- flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
-
- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- btree_type_has_snapshots(btree_id))
- flags |= BTREE_ITER_FILTER_SNAPSHOTS;
-
- if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
- flags |= BTREE_ITER_WITH_JOURNAL;
-
- iter->trans = trans;
- iter->path = NULL;
- iter->update_path = NULL;
- iter->btree_id = btree_id;
- iter->min_depth = depth;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k.type = KEY_TYPE_deleted;
- iter->k.p = pos;
- iter->k.size = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- iter->ip_allocated = ip;
-#endif
-
- iter->path = bch2_path_get(trans, btree_id, iter->pos,
- locks_want, depth, flags, ip);
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
}
-void bch2_trans_iter_init(struct btree_trans *trans,
+void bch2_trans_iter_init_outlined(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
unsigned flags)
{
- __bch2_trans_iter_init(trans, iter, btree_id, pos,
- 0, 0, flags, _RET_IP_);
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
}
void bch2_trans_node_iter_init(struct btree_trans *trans,
@@ -2883,11 +2764,16 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
unsigned depth,
unsigned flags)
{
- __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
- BTREE_ITER_NOT_EXTENTS|
- __BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- flags, _RET_IP_);
+ flags |= BTREE_ITER_NOT_EXTENTS;
+ flags |= __BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_ALL_SNAPSHOTS;
+
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
+ __bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
+
+ iter->min_depth = depth;
+
BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
BUG_ON(iter->path->level != depth);
BUG_ON(iter->min_depth != depth);
@@ -2900,38 +2786,37 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = NULL;
}
-void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
- size_t new_top = trans->mem_top + size;
+ unsigned new_top = trans->mem_top + size;
+ size_t old_bytes = trans->mem_bytes;
+ size_t new_bytes = roundup_pow_of_two(new_top);
+ void *new_mem;
void *p;
- if (new_top > trans->mem_bytes) {
- size_t old_bytes = trans->mem_bytes;
- size_t new_bytes = roundup_pow_of_two(new_top);
- void *new_mem;
+ trans->mem_max = max(trans->mem_max, new_top);
- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
- new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
- if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
- new_bytes = BTREE_TRANS_MEM_MAX;
- kfree(trans->mem);
- }
+ new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ kfree(trans->mem);
+ }
- if (!new_mem)
- return ERR_PTR(-ENOMEM);
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
- trans->mem = new_mem;
- trans->mem_bytes = new_bytes;
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
- if (old_bytes) {
- trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes);
- btree_trans_restart(trans);
- return ERR_PTR(-EINTR);
- }
+ if (old_bytes) {
+ trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
p = trans->mem + trans->mem_top;
@@ -2940,58 +2825,78 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
return p;
}
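The function above is a bump allocator over trans->mem: allocations are carved off the top, growth goes through krealloc(), and if a buffer already existed the transaction is restarted, since pointers handed out earlier may have been invalidated by the move. A userspace sketch of the same pattern, with hypothetical names and a plain flag standing in for the transaction-restart error:

#include <stdlib.h>

struct bump {
	char	*mem;
	size_t	top;
	size_t	bytes;
};

/* Illustrative sketch, not part of the patch. */
static void *bump_alloc(struct bump *b, size_t size, int *restarted)
{
	size_t new_top = b->top + size;

	*restarted = 0;

	if (new_top > b->bytes) {
		size_t old_bytes = b->bytes;
		size_t new_bytes = old_bytes ?: 64;
		void *new_mem;

		while (new_bytes < new_top)
			new_bytes *= 2;

		new_mem = realloc(b->mem, new_bytes);
		if (!new_mem)
			return NULL;

		b->mem	 = new_mem;
		b->bytes = new_bytes;

		/*
		 * Pointers handed out before the realloc may now dangle, so
		 * the caller must throw away its state and retry - the
		 * analogue of returning
		 * -BCH_ERR_transaction_restart_mem_realloced above:
		 */
		if (old_bytes) {
			*restarted = 1;
			return NULL;
		}
	}

	void *p = b->mem + b->top;

	b->top = new_top;
	return p;
}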
+static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->cached && !btree_node_locked(path, 0))
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+}
+
/**
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
* @trans: transaction to reset
*
- * While iterating over nodes or updating nodes a attempt to lock a btree
- * node may return EINTR when the trylock fails. When this occurs
- * bch2_trans_begin() should be called and the transaction retried.
+ * While iterating over nodes or updating nodes an attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs bch2_trans_begin() should be called and the transaction retried.
*/
-void bch2_trans_begin(struct btree_trans *trans)
+u32 bch2_trans_begin(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
struct btree_path *path;
- trans_for_each_update(trans, i)
- __btree_path_put(i->path, true);
+ bch2_trans_reset_updates(trans);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- trans->extra_journal_res = 0;
- trans->nr_updates = 0;
+ trans->restart_count++;
trans->mem_top = 0;
- trans->hooks = NULL;
- trans->extra_journal_entries = NULL;
- trans->extra_journal_entry_u64s = 0;
-
- if (trans->fs_usage_deltas) {
- trans->fs_usage_deltas->used = 0;
- memset(&trans->fs_usage_deltas->memset_start, 0,
- (void *) &trans->fs_usage_deltas->memset_end -
- (void *) &trans->fs_usage_deltas->memset_start);
- }
-
trans_for_each_path(trans, path) {
path->should_be_locked = false;
/*
+ * If the transaction wasn't restarted, we're presuming to be
+ * doing something new: don't keep iterators except the ones that
+ * are in use - except for the subvolumes btree:
+ */
+ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+ path->preserve = false;
+
+ /*
* XXX: we probably shouldn't be doing this if the transaction
* was restarted, but currently we still overflow transaction
* iterators if we do that
*/
if (!path->ref && !path->preserve)
__bch2_path_free(trans, path);
- else if (!path->ref)
+ else
path->preserve = false;
}
- bch2_trans_cond_resched(trans);
+ if (!trans->restarted &&
+ (need_resched() ||
+ local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+ bch2_trans_unlock(trans);
+ cond_resched();
+ bch2_trans_relock(trans);
+ }
+
+ if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+ bch2_trans_reset_srcu_lock(trans);
- if (trans->restarted)
+ trans->last_begin_ip = _RET_IP_;
+ if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
+ trans->notrace_relock_fail = false;
+ }
- trans->restarted = false;
+ trans->last_begin_time = local_clock();
+ return trans->restart_count;
}
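The comment block above only states the retry protocol in words; in code it is the loop below. A sketch: do_work() is a hypothetical callback standing in for any sequence of btree operations, and the helpers are the ones used in this file (the tree's lockrestart_do()-style wrappers express the same pattern):

/* Illustrative sketch, not part of the patch. */
static int run_in_trans(struct btree_trans *trans,
			int (*do_work)(struct btree_trans *))
{
	int ret;

	do {
		bch2_trans_begin(trans);
		ret = do_work(trans);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	return ret;
}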
static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
@@ -3003,7 +2908,7 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
BUG_ON(trans->used_mempool);
#ifdef __KERNEL__
- p = this_cpu_xchg(c->btree_paths_bufs->path , NULL);
+ p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
#endif
if (!p)
p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
@@ -3012,34 +2917,88 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
trans->updates = p; p += updates_bytes;
}
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
- unsigned expected_nr_iters,
- size_t expected_mem_bytes,
- const char *fn)
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+
+unsigned bch2_trans_get_fn_idx(const char *fn)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ if (!bch2_btree_transaction_fns[i] ||
+ bch2_btree_transaction_fns[i] == fn) {
+ bch2_btree_transaction_fns[i] = fn;
+ return i;
+ }
+
+ pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+ return i;
+}
+
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
__acquires(&c->btree_trans_barrier)
{
+ struct btree_transaction_stats *s;
+
+ bch2_assert_btree_nodes_not_locked();
+
memset(trans, 0, sizeof(*trans));
trans->c = c;
- trans->fn = fn;
+ trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
+ ? bch2_btree_transaction_fns[fn_idx] : NULL;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+ closure_init_stack(&trans->ref);
bch2_trans_alloc_paths(trans, c);
- if (expected_mem_bytes) {
- trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
- trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+ s = btree_trans_stats(trans);
+ if (s && s->max_mem) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
if (!unlikely(trans->mem)) {
trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+ } else {
+ trans->mem_bytes = expected_mem_bytes;
}
}
+ if (s) {
+ trans->nr_max_paths = s->nr_max_paths;
+ trans->wb_updates_size = s->wb_updates_size;
+ }
+
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ struct btree_trans *pos;
+
+ mutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(pos, &c->btree_trans_list, list) {
+ /*
+ * We'd much prefer to be stricter here and completely
+ * disallow multiple btree_trans in the same thread -
+ * but the data move path calls bch2_write when we
+ * already have a btree_trans initialized.
+ */
+ BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
+ bch2_trans_locked(pos));
- trans->pid = current->pid;
- mutex_lock(&c->btree_trans_lock);
- list_add(&trans->list, &c->btree_trans_list);
- mutex_unlock(&c->btree_trans_lock);
+ if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+ list_add_tail(&trans->list, &pos->list);
+ goto list_add_done;
+ }
+ }
+ list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
+ mutex_unlock(&c->btree_trans_lock);
+ }
}
static void check_btree_paths_leaked(struct btree_trans *trans)
@@ -3069,23 +3028,33 @@ void bch2_trans_exit(struct btree_trans *trans)
{
struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
bch2_trans_unlock(trans);
+ closure_sync(&trans->ref);
+
+ if (s)
+ s->max_mem = max(s->max_mem, trans->mem_max);
+
trans_for_each_update(trans, i)
__btree_path_put(i->path, true);
trans->nr_updates = 0;
check_btree_paths_leaked(trans);
- mutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- mutex_unlock(&c->btree_trans_lock);
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ mutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ mutex_unlock(&c->btree_trans_lock);
+ }
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ kfree(trans->extra_journal_entries.data);
+
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
@@ -3115,84 +3084,88 @@ void bch2_trans_exit(struct btree_trans *trans)
}
static void __maybe_unused
-bch2_btree_path_node_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *_b,
- bool cached)
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *b)
{
- pr_buf(out, " l=%u %s:",
- _b->level, bch2_btree_ids[_b->btree_id]);
- bch2_bpos_to_text(out, btree_node_pos(_b, cached));
-}
+ struct six_lock_count c = six_lock_counts(&b->lock);
+ struct task_struct *owner;
+ pid_t pid;
-static bool trans_has_locks(struct btree_trans *trans)
-{
- struct btree_path *path;
+ rcu_read_lock();
+ owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ rcu_read_unlock();
- trans_for_each_path(trans, path)
- if (path->nodes_locked)
- return true;
- return false;
+ prt_tab(out);
+ prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+ b->level, bch2_btree_ids[b->btree_id]);
+ bch2_bpos_to_text(out, btree_node_pos(b));
+
+ prt_tab(out);
+ prt_printf(out, " locks %u:%u:%u held by pid %u",
+ c.n[0], c.n[1], c.n[2], pid);
}
-void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
{
- struct btree_trans *trans;
struct btree_path *path;
- struct btree *b;
+ struct btree_bkey_cached_common *b;
+ static char lock_types[] = { 'r', 'i', 'w' };
unsigned l;
- mutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (!trans_has_locks(trans))
- continue;
+ if (!out->nr_tabstops) {
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 32);
+ }
- pr_buf(out, "%i %s\n", trans->pid, trans->fn);
+ prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
- trans_for_each_path(trans, path) {
- if (!path->nodes_locked)
- continue;
+ trans_for_each_path(trans, path) {
+ if (!path->nodes_locked)
+ continue;
- pr_buf(out, " path %u %c l=%u %s:",
- path->idx,
- path->cached ? 'c' : 'b',
- path->level,
- bch2_btree_ids[path->btree_id]);
- bch2_bpos_to_text(out, path->pos);
- pr_buf(out, "\n");
-
- for (l = 0; l < BTREE_MAX_DEPTH; l++) {
- if (btree_node_locked(path, l)) {
- pr_buf(out, " %s l=%u ",
- btree_node_intent_locked(path, l) ? "i" : "r", l);
- bch2_btree_path_node_to_text(out,
- (void *) path->l[l].b,
- path->cached);
- pr_buf(out, "\n");
- }
+ prt_printf(out, " path %u %c l=%u %s:",
+ path->idx,
+ path->cached ? 'c' : 'b',
+ path->level,
+ bch2_btree_ids[path->btree_id]);
+ bch2_bpos_to_text(out, path->pos);
+ prt_newline(out);
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ if (btree_node_locked(path, l) &&
+ !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
+ prt_printf(out, " %c l=%u ",
+ lock_types[btree_node_locked_type(path, l)], l);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
}
}
+ }
- b = READ_ONCE(trans->locking);
- if (b) {
- path = &trans->paths[trans->locking_path_idx];
- pr_buf(out, " locking path %u %c l=%u %s:",
- trans->locking_path_idx,
- path->cached ? 'c' : 'b',
- trans->locking_level,
- bch2_btree_ids[trans->locking_btree_id]);
- bch2_bpos_to_text(out, trans->locking_pos);
-
- pr_buf(out, " node ");
- bch2_btree_path_node_to_text(out,
- (void *) b, path->cached);
- pr_buf(out, "\n");
- }
+ b = READ_ONCE(trans->locking);
+ if (b) {
+ prt_printf(out, " blocked for %lluus on",
+ div_u64(local_clock() - trans->locking_wait.start_time,
+ 1000));
+ prt_newline(out);
+ prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
}
- mutex_unlock(&c->btree_trans_lock);
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
+ struct btree_transaction_stats *s;
+
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ kfree(s->max_paths_text);
+ bch2_time_stats_exit(&s->lock_hold_times);
+ }
+
if (c->btree_trans_barrier_initialized)
cleanup_srcu_struct(&c->btree_trans_barrier);
mempool_exit(&c->btree_trans_mem_pool);
@@ -3201,9 +3174,17 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
int bch2_fs_btree_iter_init(struct bch_fs *c)
{
+ struct btree_transaction_stats *s;
unsigned nr = BTREE_ITER_MAX;
int ret;
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ bch2_time_stats_init(&s->lock_hold_times);
+ mutex_init(&s->lock);
+ }
+
INIT_LIST_HEAD(&c->btree_trans_list);
mutex_init(&c->btree_trans_lock);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 981817247dfe..6b7cef145ced 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -5,6 +5,8 @@
#include "bset.h"
#include "btree_types.h"
+#include <trace/events/bcachefs.h>
+
static inline void __btree_path_get(struct btree_path *path, bool intent)
{
path->ref++;
@@ -50,13 +52,18 @@ static inline struct btree *btree_node_parent(struct btree_path *path,
return btree_path_node(path, b->c.level + 1);
}
-static inline int btree_iter_err(const struct btree_iter *iter)
+/* Iterate over paths within a transaction: */
+
+void __bch2_btree_trans_sort_paths(struct btree_trans *);
+
+static inline void btree_trans_sort_paths(struct btree_trans *trans)
{
- return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ trans->paths_sorted)
+ return;
+ __bch2_btree_trans_sort_paths(trans);
}
-/* Iterate over paths within a transaction: */
-
static inline struct btree_path *
__trans_next_path(struct btree_trans *trans, unsigned idx)
{
@@ -75,11 +82,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
return &trans->paths[idx];
}
-#define trans_for_each_path(_trans, _path) \
- for (_path = __trans_next_path((_trans), 0); \
+#define trans_for_each_path_from(_trans, _path, _start) \
+ for (_path = __trans_next_path((_trans), _start); \
(_path); \
_path = __trans_next_path((_trans), (_path)->idx + 1))
+#define trans_for_each_path(_trans, _path) \
+ trans_for_each_path_from(_trans, _path, 0)
+
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
unsigned idx = path ? path->sorted_idx + 1 : 0;
@@ -93,9 +103,10 @@ static inline struct btree_path *next_btree_path(struct btree_trans *trans, stru
static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
{
- EBUG_ON(path->sorted_idx >= trans->nr_sorted);
- return path->sorted_idx
- ? trans->paths + trans->sorted[path->sorted_idx - 1]
+ unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
+
+ return idx
+ ? trans->paths + trans->sorted[idx - 1]
: NULL;
}
@@ -104,6 +115,11 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru
((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
_i++)
+#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
+ for (_i = trans->nr_sorted - 1; \
+ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\
+ --_i)
+
static inline bool __path_has_node(const struct btree_path *path,
const struct btree *b)
{
@@ -129,23 +145,74 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
_path = __trans_next_path_with_node((_trans), (_b), \
(_path)->idx + 1))
-struct btree_path * __must_check
-bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
bool, unsigned long);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ if (path->ref > 1 || path->preserve)
+ path = __bch2_btree_path_make_mut(trans, path, intent, ip);
+ path->should_be_locked = false;
+ return path;
+}
+
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+ struct bpos, bool, unsigned long, int);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
+ struct btree_path *path, struct bpos new_pos,
+ bool intent, unsigned long ip)
+{
+ int cmp = bpos_cmp(new_pos, path->pos);
+
+ return cmp
+ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp)
+ : path;
+}
+
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+ unsigned, unsigned long);
+
+static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+ struct btree_path *path, unsigned flags)
+{
+ if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ return 0;
+
+ return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
+}
+
int __must_check bch2_btree_path_traverse(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
+ struct btree_iter *, struct bpos);
+
+void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
+
+int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
+
+static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
+{
+ return mutex_trylock(lock)
+ ? 0
+ : __bch2_trans_mutex_lock(trans, lock);
+}
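
/*
 * A minimal usage sketch (hypothetical caller, not part of this patch):
 * the inline helper takes the uncontended trylock fast path and only
 * falls back to __bch2_trans_mutex_lock(), which may drop and retake
 * transaction locks, when the mutex is contended.
 */
static int example_locked_op(struct btree_trans *trans, struct mutex *lock)
{
	int ret = bch2_trans_mutex_lock(trans, lock);

	if (ret)	/* may be a transaction restart error */
		return ret;

	/* ... work under the mutex ... */
	mutex_unlock(lock);
	return 0;
}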
+
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_trans_verify_locks(struct btree_trans *);
void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
struct bpos, bool);
#else
static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache) {}
#endif
@@ -156,46 +223,67 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
struct btree *, struct btree_node_iter *,
struct bkey_packed *, unsigned, unsigned);
-bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
-bool bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock_notrace(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
+bool bch2_trans_locked(struct btree_trans *);
-__always_inline
-static inline int btree_trans_restart(struct btree_trans *trans)
+static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count)
{
- trans->restarted = true;
- bch2_trans_unlock(trans);
- return -EINTR;
+ return restart_count != trans->restart_count;
}
-bool bch2_btree_node_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
+void bch2_trans_restart_error(struct btree_trans *, u32);
+
+static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
+ u32 restart_count)
+{
+ if (trans_was_restarted(trans, restart_count))
+ bch2_trans_restart_error(trans, restart_count);
+}
-bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
+void bch2_trans_in_restart_error(struct btree_trans *);
-static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
+static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
{
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+}
+
+__always_inline
+static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
+{
+ BUG_ON(err <= 0);
+ BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart));
- return path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
- : path->uptodate == BTREE_ITER_UPTODATE;
+ trans->restarted = err;
+ trans->last_restarted_ip = _THIS_IP_;
+ return -err;
+}
+
+__always_inline
+static inline int btree_trans_restart(struct btree_trans *trans, int err)
+{
+ btree_trans_restart_nounlock(trans, err);
+ return -err;
}
-void __bch2_btree_path_downgrade(struct btree_path *, unsigned);
+bool bch2_btree_node_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
-static inline void bch2_btree_path_downgrade(struct btree_path *path)
+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
+
+static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path)
{
unsigned new_locks_want = path->level + !!path->intent_ref;
if (path->locks_want > new_locks_want)
- __bch2_btree_path_downgrade(path, new_locks_want);
+ __bch2_btree_path_downgrade(trans, path, new_locks_want);
}
void bch2_trans_downgrade(struct btree_trans *);
@@ -207,11 +295,19 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
int __must_check bch2_btree_iter_traverse(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
+
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+}
+
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
@@ -260,8 +356,85 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna
}
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
-void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *,
+
+static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
+{
+ if (flags & BTREE_ITER_ALL_LEVELS)
+ flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ btree_node_type_is_extents(btree_id))
+ flags |= BTREE_ITER_IS_EXTENTS;
+
+ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshots(btree_id))
+ flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ btree_type_has_snapshots(btree_id))
+ flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+
+ if (trans->journal_replay_not_finished)
+ flags |= BTREE_ITER_WITH_JOURNAL;
+
+ return flags;
+}
+
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
+{
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_CACHED;
+ flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ } else if (!(flags & BTREE_ITER_CACHED))
+ flags |= BTREE_ITER_WITH_KEY_CACHE;
+
+ return __bch2_btree_iter_flags(trans, btree_id, flags);
+}
+
+static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags,
+ unsigned long ip)
+{
+ memset(iter, 0, sizeof(*iter));
+ iter->trans = trans;
+ iter->btree_id = btree_id;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k.p = pos;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ iter->ip_allocated = ip;
+#endif
+ iter->path = bch2_path_get(trans, btree_id, iter->pos,
+ locks_want, depth, flags, ip);
+}
+
+void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
unsigned, struct bpos, unsigned);
+
+static inline void bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ if (__builtin_constant_p(btree_id) &&
+ __builtin_constant_p(flags))
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _THIS_IP_);
+ else
+ bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
+}
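
/*
 * Design note (sketch): when btree_id and flags are compile-time
 * constants, __builtin_constant_p() lets bch2_btree_iter_flags() fold
 * away and the whole init is inlined; non-constant callers go through
 * bch2_trans_iter_init_outlined() to keep code size down.
 */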
+
void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
@@ -269,29 +442,96 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
- iter->path->preserve = false;
+ if (!iter->trans->restarted)
+ iter->path->preserve = false;
}
-void *bch2_trans_kmalloc(struct btree_trans *, size_t);
-void bch2_trans_begin(struct btree_trans *);
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
-static inline struct btree *
-__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter)
+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
- struct btree *b;
+ size = roundup(size, 8);
- while (b = bch2_btree_iter_peek_node(iter),
- PTR_ERR_OR_ZERO(b) == -EINTR)
- bch2_trans_begin(trans);
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
+
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
+}
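
/*
 * Usage sketch (hypothetical helper, not part of this patch): the fast
 * path bumps trans->mem_top and returns zeroed memory; there is no
 * explicit free, the allocation lives until the transaction is reset.
 */
static struct bkey_i *example_trans_alloc_key(struct btree_trans *trans,
					      unsigned u64s)
{
	struct bkey_i *k = bch2_trans_kmalloc(trans, u64s * sizeof(u64));

	if (IS_ERR(k))	/* allocation failure or restart error */
		return k;

	bkey_init(&k->k);
	k->k.u64s = u64s;
	return k;
}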
+
+static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+{
+ size = roundup(size, 8);
+
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
+
+ trans->mem_top += size;
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
+}
+
+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_i *mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k));
- return b;
+ if (!IS_ERR(mut))
+ bkey_reassemble(mut, k);
+ return mut;
}
+static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+
+ return unlikely(IS_ERR(k.k))
+ ? ERR_CAST(k.k)
+ : bch2_bkey_make_mut(trans, k);
+}
+
+#define bch2_bkey_get_mut_typed(_trans, _iter, _type) \
+({ \
+ struct bkey_i *_k = bch2_bkey_get_mut(_trans, _iter); \
+ struct bkey_i_##_type *_ret; \
+ \
+ if (IS_ERR(_k)) \
+ _ret = ERR_CAST(_k); \
+ else if (unlikely(_k->k.type != KEY_TYPE_##_type)) \
+ _ret = ERR_PTR(-ENOENT); \
+ else \
+ _ret = bkey_i_to_##_type(_k); \
+ _ret; \
+})
+
+#define bch2_bkey_alloc(_trans, _iter, _type) \
+({ \
+ struct bkey_i_##_type *_k = bch2_trans_kmalloc_nomemzero(_trans, sizeof(*_k));\
+ if (!IS_ERR(_k)) { \
+ bkey_##_type##_init(&_k->k_i); \
+ _k->k.p = (_iter)->pos; \
+ } \
+ _k; \
+})
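
/*
 * Usage sketch for the typed accessor above (hypothetical example;
 * KEY_TYPE_inode / bkey_i_inode are existing bcachefs key types):
 */
static int example_modify_inode(struct btree_trans *trans,
				struct btree_iter *iter)
{
	struct bkey_i_inode *inode =
		bch2_bkey_get_mut_typed(trans, iter, inode);
	int ret = PTR_ERR_OR_ZERO(inode);

	if (ret)	/* -ENOENT if the slot wasn't KEY_TYPE_inode */
		return ret;

	/* ... modify inode->v ..., then queue the update: */
	return bch2_trans_update(trans, iter, &inode->k_i, 0);
}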
+
+u32 bch2_trans_begin(struct btree_trans *);
+
+/*
+ * XXX
+ * this does not handle transaction restarts from bch2_btree_iter_next_node()
+ * correctly
+ */
#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
_locks_want, _depth, _flags, _b, _ret) \
for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
_start, _locks_want, _depth, _flags); \
- (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\
+ (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \
!((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \
(_b) = bch2_btree_iter_next_node(&(_iter)))
@@ -305,20 +545,48 @@ static inline int bkey_err(struct bkey_s_c k)
return PTR_ERR_OR_ZERO(k.k);
}
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+ unsigned flags)
+{
+ BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
+
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek_prev(iter);
+}
+
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
- unsigned flags)
+ unsigned flags)
{
- return flags & BTREE_ITER_SLOTS
- ? bch2_btree_iter_peek_slot(iter)
- : bch2_btree_iter_peek(iter);
+ return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
+ flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ if (!(flags & BTREE_ITER_SLOTS))
+ return bch2_btree_iter_peek_upto(iter, end);
+
+ if (bkey_gt(iter->pos, end))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
}
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
- ? -EINTR : 0;
+ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
+ trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+ }
+
+ return 0;
}
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+
static inline struct bkey_s_c
__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
struct btree_iter *iter, unsigned flags)
@@ -327,12 +595,178 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
while (btree_trans_too_many_iters(trans) ||
(k = bch2_btree_iter_peek_type(iter, flags),
- bkey_err(k) == -EINTR))
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
bch2_trans_begin(trans);
return k;
}
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_upto_type(iter, end, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+#define lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count; \
+ int _ret; \
+ \
+ do { \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret = (_do); \
+ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \
+ \
+ if (!_ret) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ _ret; \
+})
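
/*
 * Minimal usage sketch, with a hypothetical helper do_update():
 * lockrestart_do() re-runs the expression from a fresh
 * bch2_trans_begin() until it stops returning a transaction restart
 * error, then asserts that no restart leaked through.
 */
static int do_update(struct btree_trans *trans);	/* hypothetical */

static int example_retry_loop(struct btree_trans *trans)
{
	return lockrestart_do(trans, do_update(trans));
}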
+
+/*
+ * nested_lockrestart_do(), nested_commit_do():
+ *
+ * These are like lockrestart_do() and commit_do(), with two differences:
+ *
+ * - We don't call bch2_trans_begin() unless we had a transaction restart
+ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
+ * transaction restart
+ */
+#define nested_lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count, _orig_restart_count; \
+ int _ret; \
+ \
+ _restart_count = _orig_restart_count = (_trans)->restart_count; \
+ \
+ while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\
+ _restart_count = bch2_trans_begin(_trans); \
+ \
+ if (!_ret) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \
+ _ret = -BCH_ERR_transaction_restart_nested; \
+ \
+ _ret; \
+})
+
+#define for_each_btree_key2(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ \
+ _ret = 0; \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ _ret = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_advance(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret; \
+})
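
/*
 * Usage sketch (hypothetical example over the extents btree):
 * transaction restarts are handled inside the macro by re-running _do
 * for the current key after bch2_trans_begin().
 */
static int example_count_extents(struct btree_trans *trans, u64 *nr)
{
	struct btree_iter iter;
	struct bkey_s_c k;

	return for_each_btree_key2(trans, iter, BTREE_ID_extents,
				   POS_MIN, 0, k,
				   ((*nr)++, 0));
}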
+
+#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
+({ \
+ int _ret = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ \
+ _ret = 0; \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
+ if (!(_k).k) \
+ break; \
+ \
+ _ret = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_advance(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret; \
+})
+
+#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
+ if (!(_k).k) { \
+ _ret = 0; \
+ break; \
+ } \
+ \
+ _ret = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_rewind(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret; \
+})
+
+#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \
+ _start, _end, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@@ -341,6 +775,15 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \
+ &(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@@ -349,6 +792,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
+#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
for (; \
(_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
@@ -361,16 +812,36 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
+#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
+ for (; \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
/* new multiple iterator interface: */
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
- unsigned, size_t, const char *);
+void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned);
void bch2_trans_exit(struct btree_trans *);
-#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__)
+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+unsigned bch2_trans_get_fn_idx(const char *);
+
+#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \
+do { \
+ static unsigned trans_fn_idx; \
+ \
+ if (unlikely(!trans_fn_idx)) \
+ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
+ \
+ __bch2_trans_init(_trans, _c, trans_fn_idx); \
+} while (0)
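
/*
 * Transaction lifecycle sketch (hypothetical caller): bch2_trans_init()
 * now resolves __func__ to a cached per-callsite index the first time
 * it runs, instead of carrying the function name string around.
 */
static int example_transaction(struct bch_fs *c)
{
	struct btree_trans trans;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);
	/* btree work via the iterator/commit helpers goes here */
	bch2_trans_exit(&trans);
	return ret;
}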
-void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
void bch2_fs_btree_iter_exit(struct bch_fs *);
int bch2_fs_btree_iter_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 61a447dc578a..33269afe9cf2 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_cache.h"
@@ -5,13 +6,20 @@
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
#include <trace/events/bcachefs.h>
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+ return id == BTREE_ID_subvolumes;
+}
+
static struct kmem_cache *bch2_key_cache;
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -20,8 +28,8 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const struct bkey_cached *ck = obj;
const struct bkey_cached_key *key = arg->key;
- return cmp_int(ck->key.btree_id, key->btree_id) ?:
- bpos_cmp(ck->key.pos, key->pos);
+ return ck->key.btree_id != key->btree_id ||
+ !bpos_eq(ck->key.pos, key->pos);
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
@@ -49,13 +57,12 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
if (!six_trylock_intent(&ck->c.lock))
return false;
- if (!six_trylock_write(&ck->c.lock)) {
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
six_unlock_intent(&ck->c.lock);
return false;
}
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- six_unlock_write(&ck->c.lock);
+ if (!six_trylock_write(&ck->c.lock)) {
six_unlock_intent(&ck->c.lock);
return false;
}
@@ -83,32 +90,208 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- list_move_tail(&ck->list, &bc->freed);
- bc->nr_freed++;
+ if (ck->c.lock.readers)
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ else
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ atomic_long_inc(&bc->nr_freed);
+
+ kfree(ck->k);
+ ck->k = NULL;
+ ck->u64s = 0;
+
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+}
+
+#ifdef __KERNEL__
+static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bkey_cached *pos;
+
+ list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
+ if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
+ pos->btree_trans_barrier_seq)) {
+ list_move(&ck->list, &pos->list);
+ return;
+ }
+ }
+
+ list_move(&ck->list, &bc->freed_nonpcpu);
+}
+#endif
+
+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+ if (!ck->c.lock.readers) {
+#ifdef __KERNEL__
+ struct btree_key_cache_freelist *f;
+ bool freed = false;
+
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ if (f->nr < ARRAY_SIZE(f->objs)) {
+ f->objs[f->nr++] = ck;
+ freed = true;
+ }
+ preempt_enable();
+
+ if (!freed) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (f->nr > ARRAY_SIZE(f->objs) / 2) {
+ struct bkey_cached *ck2 = f->objs[--f->nr];
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck2);
+ }
+	/* for iterating over held locks: */
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck);
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ mutex_unlock(&bc->lock);
+ }
+}
+
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ list_del_init(&ck->list);
+ atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
+ bkey_cached_move_to_freelist(bc, ck);
+
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
static struct bkey_cached *
-bkey_cached_alloc(struct btree_key_cache *c)
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
+ bool *was_new)
{
- struct bkey_cached *ck;
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck = NULL;
+ bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+ int ret;
+
+ if (!pcpu_readers) {
+#ifdef __KERNEL__
+ struct btree_key_cache_freelist *f;
+
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+ if (f->nr)
+ ck = f->objs[--f->nr];
+ preempt_enable();
+
+ if (!ck) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (!list_empty(&bc->freed_nonpcpu) &&
+ f->nr < ARRAY_SIZE(f->objs) / 2) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ f->objs[f->nr++] = ck;
+ }
+
+ ck = f->nr ? f->objs[--f->nr] : NULL;
+ preempt_enable();
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_nonpcpu)) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ }
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_pcpu)) {
+ ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ }
+ mutex_unlock(&bc->lock);
+ }
+
+ if (ck) {
+ int ret;
+
+ ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
+ if (unlikely(ret)) {
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ path->l[0].b = (void *) ck;
+ path->l[0].lock_seq = ck->c.lock.state.seq;
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+
+ ret = bch2_btree_node_lock_write(trans, path, &ck->c);
+ if (unlikely(ret)) {
+ btree_node_unlock(trans, path, 0);
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
- ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
- if (likely(ck)) {
- INIT_LIST_HEAD(&ck->list);
- six_lock_init(&ck->c.lock);
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
return ck;
}
- return NULL;
+ ck = kmem_cache_zalloc(bch2_key_cache, GFP_NOWAIT|__GFP_NOWARN);
+ if (likely(ck))
+ goto init;
+
+ bch2_trans_unlock(trans);
+
+ ck = kmem_cache_zalloc(bch2_key_cache, GFP_KERNEL);
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kmem_cache_free(bch2_key_cache, ck);
+ return ERR_PTR(ret);
+ }
+
+ if (!ck)
+ return NULL;
+init:
+ INIT_LIST_HEAD(&ck->list);
+ bch2_btree_lock_init(&ck->c);
+ if (pcpu_readers)
+ six_lock_pcpu_alloc(&ck->c.lock);
+
+ ck->c.cached = true;
+ BUG_ON(!six_trylock_intent(&ck->c.lock));
+ BUG_ON(!six_trylock_write(&ck->c.lock));
+ *was_new = true;
+ return ck;
}
static struct bkey_cached *
@@ -120,15 +303,6 @@ bkey_cached_reuse(struct btree_key_cache *c)
unsigned i;
mutex_lock(&c->lock);
- list_for_each_entry_reverse(ck, &c->freed, list)
- if (bkey_cached_lock_for_evict(ck)) {
- c->nr_freed--;
- list_del(&ck->list);
- mutex_unlock(&c->lock);
- return ck;
- }
- mutex_unlock(&c->lock);
-
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
@@ -136,46 +310,46 @@ bkey_cached_reuse(struct btree_key_cache *c)
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(c, ck);
- rcu_read_unlock();
- return ck;
+ goto out;
}
}
+ ck = NULL;
+out:
rcu_read_unlock();
-
- return NULL;
+ mutex_unlock(&c->lock);
+ return ck;
}
static struct bkey_cached *
-btree_key_cache_create(struct bch_fs *c,
- enum btree_id btree_id,
- struct bpos pos)
+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
+ struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck;
- bool was_new = true;
+ bool was_new = false;
- ck = bkey_cached_alloc(bc);
+ ck = bkey_cached_alloc(trans, path, &was_new);
+ if (IS_ERR(ck))
+ return ck;
if (unlikely(!ck)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_ids[btree_id]);
- return ERR_PTR(-ENOMEM);
+ bch2_btree_ids[path->btree_id]);
+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
}
- was_new = false;
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+ } else {
+ if (path->btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
}
- if (btree_id == BTREE_ID_subvolumes)
- six_lock_pcpu_alloc(&ck->c.lock);
- else
- six_lock_pcpu_free(&ck->c.lock);
-
ck->c.level = 0;
- ck->c.btree_id = btree_id;
- ck->key.btree_id = btree_id;
- ck->key.pos = pos;
+ ck->c.btree_id = path->btree_id;
+ ck->key.btree_id = path->btree_id;
+ ck->key.pos = path->pos;
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
@@ -189,11 +363,10 @@ btree_key_cache_create(struct bch_fs *c,
six_unlock_intent(&ck->c.lock);
kfree(ck);
} else {
- mutex_lock(&bc->lock);
- bkey_cached_free(bc, ck);
- mutex_unlock(&bc->lock);
+ bkey_cached_free_fast(bc, ck);
}
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
return NULL;
}
@@ -214,17 +387,17 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct bkey_i *new_k = NULL;
int ret;
- bch2_trans_iter_init(trans, &iter, ck->key.btree_id,
- ck->key.pos, BTREE_ITER_SLOTS);
+ bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+ BTREE_ITER_KEY_CACHE_FILL|
+ BTREE_ITER_CACHED_NOFILL);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_trans_restart_relock_key_cache_fill(trans->fn,
- _THIS_IP_, ck_path->btree_id, &ck_path->pos);
- ret = btree_trans_restart(trans);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
goto err;
}
@@ -234,22 +407,48 @@ static int btree_key_cache_fill(struct btree_trans *trans,
*/
new_u64s = k.k->u64s + 1;
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ new_u64s = min(256U, (new_u64s * 3) / 2);
+
if (new_u64s > ck->u64s) {
new_u64s = roundup_pow_of_two(new_u64s);
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_ids[ck->key.btree_id], new_u64s);
- ret = -ENOMEM;
- goto err;
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_ids[ck->key.btree_id], new_u64s);
+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ goto err;
+ }
+
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ kfree(new_k);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+ goto err;
+ }
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
}
}
- /*
- * XXX: not allowed to be holding read locks when we take a write lock,
- * currently
- */
- bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
+ ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+
if (new_k) {
kfree(ck->k);
ck->u64s = new_u64s;
@@ -267,18 +466,9 @@ err:
return ret;
}
-static int bkey_cached_check_fn(struct six_lock *lock, void *p)
-{
- struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
- const struct btree_path *path = p;
-
- return ck->key.btree_id == path->btree_id &&
- !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
-}
-
-__flatten
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
- unsigned flags)
+static noinline int
+bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
@@ -288,78 +478,136 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path
path->l[1].b = NULL;
- if (bch2_btree_node_relock(trans, path, 0)) {
+ if (bch2_btree_node_relock_notrace(trans, path, 0)) {
ck = (void *) path->l[0].b;
goto fill;
}
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck) {
- if (flags & BTREE_ITER_CACHED_NOCREATE) {
- path->l[0].b = NULL;
- return 0;
- }
-
- ck = btree_key_cache_create(c, path->btree_id, path->pos);
+ ck = btree_key_cache_create(trans, path);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
if (!ck)
goto retry;
- mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
path->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(path, 0);
- if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
- lock_want,
- bkey_cached_check_fn, path, _THIS_IP_)) {
- if (!trans->restarted)
- goto retry;
-
- trace_transaction_restart_ip(trans->fn, _THIS_IP_);
- ret = -EINTR;
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err;
- }
+
+ BUG_ON(ret);
if (ck->key.btree_id != path->btree_id ||
- bpos_cmp(ck->key.pos, path->pos)) {
+ !bpos_eq(ck->key.pos, path->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
- mark_btree_node_locked(path, 0, lock_want);
+ mark_btree_node_locked(trans, path, 0, lock_want);
}
path->l[0].lock_seq = ck->c.lock.state.seq;
path->l[0].b = (void *) ck;
fill:
+ path->uptodate = BTREE_ITER_UPTODATE;
+
if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+ /*
+ * Using the underscore version because we haven't set
+ * path->uptodate yet:
+ */
if (!path->locks_want &&
!__bch2_btree_path_upgrade(trans, path, 1)) {
- trace_transaction_restart_ip(trans->fn, _THIS_IP_);
- ret = btree_trans_restart(trans);
+ trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;
}
ret = btree_key_cache_fill(trans, path, ck);
if (ret)
goto err;
+
+ ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
+ if (ret)
+ goto err;
+
+ path->uptodate = BTREE_ITER_UPTODATE;
}
if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- path->uptodate = BTREE_ITER_UPTODATE;
BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+ BUG_ON(path->uptodate);
return ret;
err:
- if (ret != -EINTR) {
- btree_node_unlock(path, 0);
- path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
+ }
+ return ret;
+}
+
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck;
+ int ret = 0;
+
+ EBUG_ON(path->level);
+
+ path->l[1].b = NULL;
+
+ if (bch2_btree_node_relock_notrace(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
+ goto fill;
+ }
+retry:
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+ if (!ck) {
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+ } else {
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret)
+ return ret;
+
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
+ }
+
+ mark_btree_node_locked(trans, path, 0, lock_want);
}
+
+ path->l[0].lock_seq = ck->c.lock.state.seq;
+ path->l[0].b = (void *) ck;
+fill:
+ if (!ck->valid)
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+ EBUG_ON(!ck->valid);
+ EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+
return ret;
}
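
/*
 * Design note (sketch): the function above is the fast path - the key
 * is already present in the cache and valid, so all it does is relock
 * or take the node lock. Anything needing allocation, a fill from the
 * btree, or a locks_want upgrade falls back to the noinline slowpath.
 */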
@@ -381,31 +629,36 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_ITER_ALL_SNAPSHOTS);
bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
+ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
goto out;
ck = (void *) c_iter.path->l[0].b;
- if (!ck ||
- (journal_seq && ck->journal.seq != journal_seq))
+ if (!ck)
goto out;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- if (!evict)
- goto out;
- goto evict;
+ if (evict)
+ goto evict;
+ goto out;
}
+ BUG_ON(!ck->valid);
+
+ if (journal_seq && ck->journal.seq != journal_seq)
+ goto out;
+
/*
* Since journal reclaim depends on us making progress here, and the
* allocator/copygc depend on journal reclaim making progress, we need
* to be using alloc reserves:
- * */
+ */
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
@@ -413,16 +666,17 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
(ck->journal.seq == journal_last_seq(j)
- ? BTREE_INSERT_JOURNAL_RESERVED
+ ? JOURNAL_WATERMARK_reserved
: 0)|
commit_flags);
- if (ret) {
- bch2_fs_fatal_err_on(ret != -EINTR &&
- ret != -EAGAIN &&
- !bch2_journal_error(j), c,
- "error flushing key cache: %i", ret);
+
+ bch2_fs_fatal_err_on(ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
+ !bch2_journal_error(j), c,
+ "error flushing key cache: %s", bch2_err_str(ret));
+ if (ret)
goto out;
- }
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
@@ -435,24 +689,22 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
} else {
+ struct btree_path *path2;
evict:
- BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
+ trans_for_each_path(trans, path2)
+ if (path2 != c_iter.path)
+ __bch2_btree_path_unlock(trans, path2);
- mark_btree_node_unlocked(c_iter.path, 0);
- c_iter.path->l[0].b = NULL;
-
- six_lock_write(&ck->c.lock, NULL, NULL);
+ bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
+ mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
-
- mutex_lock(&c->btree_key_cache.lock);
- bkey_cached_free(&c->btree_key_cache, ck);
- mutex_unlock(&c->btree_key_cache.lock);
+ bkey_cached_free_fast(&c->btree_key_cache, ck);
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@@ -467,11 +719,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
+ struct btree_trans trans;
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
int ret = 0;
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ bch2_trans_init(&trans, c, 0, 0);
- six_lock_read(&ck->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
key = ck->key;
if (ck->journal.seq != seq ||
@@ -479,14 +733,22 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
six_unlock_read(&ck->c.lock);
goto unlock;
}
+
+ if (ck->seq != seq) {
+ bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
+ bch2_btree_key_cache_journal_flush);
+ six_unlock_read(&ck->c.lock);
+ goto unlock;
+ }
six_unlock_read(&ck->c.lock);
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
btree_key_cache_flush_pos(&trans, key, seq,
BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ bch2_trans_exit(&trans);
return ret;
}
@@ -507,21 +769,22 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
}
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
- struct btree_path *path,
- struct bkey_i *insert)
+ unsigned flags,
+ struct btree_insert_entry *insert_entry)
{
struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+ struct bkey_i *insert = insert_entry->k;
bool kick_reclaim = false;
- BUG_ON(insert->u64s > ck->u64s);
+ BUG_ON(insert->k.u64s > ck->u64s);
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
int difference;
- BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);
+ BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
- difference = jset_u64s(insert->u64s) - ck->res.u64s;
+ difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
if (difference > 0) {
trans->journal_preres.u64s -= difference;
ck->res.u64s += difference;
@@ -539,21 +802,50 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
kick_reclaim = true;
}
- bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
- &ck->journal, bch2_btree_key_cache_journal_flush);
+ /*
+ * To minimize lock contention, we only add the journal pin here and
+ * defer pin updates to the flush callback via ->seq. Be careful not to
+ * update ->seq on nojournal commits because we don't want to update the
+ * pin to a seq that doesn't include journal updates on disk. Otherwise
+ * we risk losing the update after a crash.
+ *
+ * The only exception is if the pin is not active in the first place. We
+ * have to add the pin because journal reclaim drives key cache
+ * flushing. The flush callback will not proceed unless ->seq matches
+ * the latest pin, so make sure it starts with a consistent value.
+ */
+ if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+ !journal_pin_active(&ck->journal)) {
+ ck->seq = trans->journal_res.seq;
+ }
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+ &ck->journal, bch2_btree_key_cache_journal_flush);
if (kick_reclaim)
journal_reclaim_kick(&c->journal);
return true;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
- enum btree_id id, struct bpos pos)
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+ struct btree_path *path)
{
- BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ BUG_ON(!ck->valid);
+
+ /*
+ * We just did an update to the btree, bypassing the key cache: the key
+ * cache key is now stale and must be dropped, even if dirty:
+ */
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ }
+
+ ck->valid = false;
}
-#endif
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
@@ -567,12 +859,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
unsigned start, flags;
int srcu_idx;
- /* Return -1 if we can't do anything right now */
- if (sc->gfp_mask & __GFP_FS)
- mutex_lock(&bc->lock);
- else if (!mutex_trylock(&bc->lock))
- return -1;
-
+ mutex_lock(&bc->lock);
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
@@ -580,14 +867,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
- list_for_each_entry_safe(ck, t, &bc->freed, list) {
+ list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
+ six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
- bc->nr_freed--;
+ atomic_long_dec(&bc->nr_freed);
+ scanned++;
+ freed++;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ six_lock_pcpu_free(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ atomic_long_dec(&bc->nr_freed);
scanned++;
freed++;
}
@@ -604,7 +908,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
do {
struct rhash_head *pos, *next;
- pos = *rht_bucket(tbl, bc->shrink_iter);
+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
while (!rht_is_a_nulls(pos)) {
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
@@ -659,24 +963,50 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
struct bucket_table *tbl;
struct bkey_cached *ck, *n;
struct rhash_head *pos;
+ LIST_HEAD(items);
unsigned i;
+#ifdef __KERNEL__
+ int cpu;
+#endif
if (bc->shrink.list.next)
unregister_shrinker(&bc->shrink);
mutex_lock(&bc->lock);
- rcu_read_lock();
- tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl)
- for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- bkey_cached_evict(bc, ck);
- list_add(&ck->list, &bc->freed);
- }
- rcu_read_unlock();
+ /*
+ * The loop is needed to guard against racing with rehash:
+ */
+ while (atomic_long_read(&bc->nr_keys)) {
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (tbl)
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &items);
+ }
+ rcu_read_unlock();
+ }
+
+#ifdef __KERNEL__
+ for_each_possible_cpu(cpu) {
+ struct btree_key_cache_freelist *f =
+ per_cpu_ptr(bc->pcpu_freed, cpu);
+
+ for (i = 0; i < f->nr; i++) {
+ ck = f->objs[i];
+ list_add(&ck->list, &items);
+ }
+ }
+#endif
+
+ list_splice(&bc->freed_pcpu, &items);
+ list_splice(&bc->freed_nonpcpu, &items);
+
+ mutex_unlock(&bc->lock);
- list_for_each_entry_safe(ck, n, &bc->freed, list) {
+ list_for_each_entry_safe(ck, n, &items, list) {
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
@@ -684,53 +1014,82 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
list_del(&ck->list);
kfree(ck->k);
+ six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
- BUG_ON(atomic_long_read(&bc->nr_dirty) &&
- !bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_WAS_RW, &c->flags));
- BUG_ON(atomic_long_read(&bc->nr_keys));
+ if (atomic_long_read(&bc->nr_dirty) &&
+ !bch2_journal_error(&c->journal) &&
+ test_bit(BCH_FS_WAS_RW, &c->flags))
+ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+ atomic_long_read(&bc->nr_dirty));
- mutex_unlock(&bc->lock);
+ if (atomic_long_read(&bc->nr_keys))
+ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+ atomic_long_read(&bc->nr_keys));
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
+
+ free_percpu(bc->pcpu_freed);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
- INIT_LIST_HEAD(&c->freed);
+ INIT_LIST_HEAD(&c->freed_pcpu);
+ INIT_LIST_HEAD(&c->freed_nonpcpu);
}
-int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
+static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
- int ret;
+ struct btree_key_cache *bc =
+ container_of(shrink, struct btree_key_cache, shrink);
+ char *cbuf;
+ size_t buflen = seq_buf_get_buf(s, &cbuf);
+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
+
+ bch2_btree_key_cache_to_text(&out, bc);
+ seq_buf_commit(s, out.pos);
+}
- ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
- if (ret)
- return ret;
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
- c->table_init_done = true;
+#ifdef __KERNEL__
+ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
+ if (!bc->pcpu_freed)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+#endif
+
+ if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
- c->shrink.seeks = 1;
- c->shrink.count_objects = bch2_btree_key_cache_count;
- c->shrink.scan_objects = bch2_btree_key_cache_scan;
- return register_shrinker(&c->shrink);
+ bc->table_init_done = true;
+
+ bc->shrink.seeks = 0;
+ bc->shrink.count_objects = bch2_btree_key_cache_count;
+ bc->shrink.scan_objects = bch2_btree_key_cache_scan;
+ bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text;
+ if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name))
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return 0;
}
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
- pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
- pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys));
- pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
+ prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed));
+ prt_newline(out);
+ prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
+ prt_newline(out);
+ prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
+ prt_newline(out);
}
void bch2_btree_key_cache_exit(void)
{
- if (bch2_key_cache)
- kmem_cache_destroy(bch2_key_cache);
+ kmem_cache_destroy(bch2_key_cache);
}
int __init bch2_btree_key_cache_init(void)
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index b3d241b13453..be3acde2caa0 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
@@ -28,18 +29,12 @@ bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
unsigned);
-bool bch2_btree_insert_key_cached(struct btree_trans *,
- struct btree_path *, struct bkey_i *);
+bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
+ struct btree_insert_entry *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_key_cache_verify_clean(struct btree_trans *,
- enum btree_id, struct bpos);
-#else
-static inline void
-bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
- enum btree_id id, struct bpos pos) {}
-#endif
+void bch2_btree_key_cache_drop(struct btree_trans *,
+ struct btree_path *);
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
new file mode 100644
index 000000000000..b99986653ade
--- /dev/null
+++ b/fs/bcachefs/btree_locking.c
@@ -0,0 +1,795 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+static struct lock_class_key bch2_btree_node_lock_key;
+
+void bch2_btree_lock_init(struct btree_bkey_cached_common *b)
+{
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key);
+}
+
+#ifdef CONFIG_LOCKDEP
+void bch2_assert_btree_nodes_not_locked(void)
+{
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+}
+#endif
+
+/* Btree node locking: */
+
+static inline void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+ if (lock->readers)
+ this_cpu_add(*lock->readers, nr);
+ else if (nr > 0)
+ atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
+ else
+ atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
+}
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+ struct btree_path *skip,
+ struct btree_bkey_cached_common *b,
+ unsigned level)
+{
+ struct btree_path *path;
+ struct six_lock_count ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ if (IS_ERR_OR_NULL(b))
+ return ret;
+
+ trans_for_each_path(trans, path)
+ if (path != skip && &path->l[level].b->c == b) {
+ int t = btree_node_locked_type(path, level);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ ret.n[t]++;
+ }
+
+ return ret;
+}
+
+/* unlock */
+
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
+{
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+ struct btree_trans *trans;
+ struct btree_bkey_cached_common *node_want;
+ enum six_lock_type lock_want;
+
+ /* for iterating over held locks :*/
+ u8 path_idx;
+ u8 level;
+ u64 lock_start_time;
+};
+
+struct lock_graph {
+ struct trans_waiting_for_lock g[8];
+ unsigned nr;
+};
+
+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ prt_printf(out, "Found lock cycle (%u entries):", g->nr);
+ prt_newline(out);
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ bch2_btree_trans_to_text(out, i->trans);
+}
+
+static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i != g->g + g->nr; i++) {
+ if (i != g->g)
+ prt_str(out, "<- ");
+ prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+ }
+ prt_newline(out);
+}
+
+static void lock_graph_up(struct lock_graph *g)
+{
+ closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static noinline void lock_graph_pop_all(struct lock_graph *g)
+{
+ while (g->nr)
+ lock_graph_up(g);
+}
+
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ closure_get(&trans->ref);
+
+ g->g[g->nr++] = (struct trans_waiting_for_lock) {
+ .trans = trans,
+ .node_want = trans->locking,
+ .lock_want = trans->locking_wait.lock_want,
+ };
+}
+
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g + 1; i < g->g + g->nr; i++)
+ if (i->trans->locking != i->node_want ||
+ i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+ while (g->g + g->nr > i)
+ lock_graph_up(g);
+ return true;
+ }
+
+ return false;
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+ if (i == g->g) {
+ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ } else {
+ i->trans->lock_must_abort = true;
+ wake_up_process(i->trans->locking_wait.task);
+ return 0;
+ }
+}
+
+static int btree_trans_abort_preference(struct btree_trans *trans)
+{
+ if (trans->lock_may_not_fail)
+ return 0;
+ if (trans->locking_wait.lock_want == SIX_LOCK_write)
+ return 1;
+ if (!trans->in_traverse_all)
+ return 2;
+ return 3;
+}
+
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+ struct trans_waiting_for_lock *i, *abort = NULL;
+ unsigned best = 0, pref;
+ int ret;
+
+ if (lock_graph_remove_non_waiters(g))
+ return 0;
+
+ /* Only checking, for debugfs: */
+ if (cycle) {
+ print_cycle(cycle, g);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ pref = btree_trans_abort_preference(i->trans);
+ if (pref > best) {
+ abort = i;
+ best = pref;
+ }
+ }
+
+ if (unlikely(!best)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+ }
+
+ ret = abort_lock(g, abort);
+out:
+ if (ret)
+ while (g->nr)
+ lock_graph_up(g);
+ return ret;
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+ struct printbuf *cycle)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ if (i->trans == trans)
+ return break_cycle(g, cycle);
+
+ if (g->nr == ARRAY_SIZE(g->g)) {
+ if (orig_trans->lock_may_not_fail)
+ return 0;
+
+ while (g->nr)
+ lock_graph_up(g);
+
+ if (cycle)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+ }
+
+ lock_graph_down(g, trans);
+ return 0;
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+ return t1 + t2 > 1;
+}
+
+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+{
+ struct lock_graph g;
+ struct trans_waiting_for_lock *top;
+ struct btree_bkey_cached_common *b;
+ struct btree_path *path;
+ int ret;
+
+ if (trans->lock_must_abort) {
+ if (cycle)
+ return -1;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+ }
+
+ g.nr = 0;
+ lock_graph_down(&g, trans);
+next:
+ if (!g.nr)
+ return 0;
+
+ top = &g.g[g.nr - 1];
+
+ trans_for_each_path_from(top->trans, path, top->path_idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ if (top->path_idx != path->idx) {
+ top->path_idx = path->idx;
+ top->level = 0;
+ top->lock_start_time = 0;
+ }
+
+ for (;
+ top->level < BTREE_MAX_DEPTH;
+ top->level++, top->lock_start_time = 0) {
+ int lock_held = btree_node_locked_type(path, top->level);
+
+ if (lock_held == BTREE_NODE_UNLOCKED)
+ continue;
+
+ b = &READ_ONCE(path->l[top->level].b)->c;
+
+ if (IS_ERR_OR_NULL(b)) {
+ /*
+ * If we get here, it means we raced with the
+ * other thread updating its btree_path
+ * structures - which means it can't be blocked
+ * waiting on a lock:
+ */
+ if (!lock_graph_remove_non_waiters(&g)) {
+ /*
+ * If lock_graph_remove_non_waiters()
+ * didn't do anything, it must be
+ * because we're being called by debugfs
+ * checking for lock cycles, which
+ * invokes us on btree_transactions that
+ * aren't actually waiting on anything.
+ * Just bail out:
+ */
+ lock_graph_pop_all(&g);
+ }
+
+ goto next;
+ }
+
+ if (list_empty_careful(&b->lock.wait_list))
+ continue;
+
+ raw_spin_lock(&b->lock.wait_lock);
+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+ BUG_ON(b != trans->locking);
+
+ if (top->lock_start_time &&
+ time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+ continue;
+
+ top->lock_start_time = trans->locking_wait.start_time;
+
+ /* Don't check for self deadlock: */
+ if (trans == top->trans ||
+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+ continue;
+
+ ret = lock_graph_descend(&g, trans, cycle);
+ raw_spin_unlock(&b->lock.wait_lock);
+
+ if (ret)
+ return ret;
+ goto next;
+
+ }
+ raw_spin_unlock(&b->lock.wait_lock);
+ }
+ }
+
+ if (g.nr > 1 && cycle)
+ print_chain(cycle, &g);
+ lock_graph_up(&g);
+ goto next;
+}
+
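/*
 * Illustrative sketch (not part of this patch): bch2_check_for_deadlock()
 * above walks a "wait-for" graph -- transaction A blocks on a lock held by B,
 * B blocks on one held by C, and so on -- and breaks any cycle it finds by
 * restarting one of the transactions.  The standalone userspace program below
 * shows the same walk with a fixed-size stack of waiters, analogous to
 * struct lock_graph; every name in it (struct waiter, wait_graph, has_cycle)
 * is made up for illustration and is not a bcachefs API.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_DEPTH	8	/* mirrors the 8-entry lock_graph above */

struct waiter {
	int	id;		/* which transaction */
	int	waits_on;	/* transaction it is blocked on, -1 if none */
};

struct wait_graph {
	struct waiter	g[MAX_DEPTH];
	unsigned	nr;
};

/* Follow waits_on edges from @start; true if we revisit a transaction. */
static bool has_cycle(const struct waiter *table, unsigned nr_trans, int start)
{
	struct wait_graph g = { .nr = 0 };
	int cur = start;

	while (cur >= 0 && cur < (int) nr_trans && g.nr < MAX_DEPTH) {
		unsigned i;

		for (i = 0; i < g.nr; i++)
			if (g.g[i].id == cur)
				return true;	/* found a cycle */

		g.g[g.nr++] = table[cur];
		cur = table[cur].waits_on;
	}
	return false;
}

int main(void)
{
	/* 0 waits on 1, 1 waits on 2, 2 waits on 0: a three-way deadlock */
	struct waiter table[] = {
		{ .id = 0, .waits_on = 1 },
		{ .id = 1, .waits_on = 2 },
		{ .id = 2, .waits_on = 0 },
	};

	printf("deadlock cycle from 0: %s\n",
	       has_cycle(table, 3, 0) ? "yes" : "no");
	return 0;
}
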
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+ struct btree_trans *trans = p;
+
+ return bch2_check_for_deadlock(trans, NULL);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+ int ret;
+
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ six_lock_readers_add(&b->lock, -readers);
+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
+ lock_may_not_fail, _RET_IP_);
+ six_lock_readers_add(&b->lock, readers);
+
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+ return ret;
+}
+
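/*
 * Illustrative sketch (not part of this patch): six_lock_write() waits for the
 * node's reader count to reach zero, so a thread that itself holds read locks
 * on the node (as counted by bch2_btree_node_lock_counts() above) subtracts
 * its own readers before blocking and adds them back afterwards -- safe only
 * because it also holds the intent lock.  The toy program below shows just the
 * counting; struct toy_rw and try_write_lock() are made-up names.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_rw {
	int readers;
};

static bool try_write_lock(struct toy_rw *l)
{
	return l->readers == 0;		/* writers must wait for all readers */
}

int main(void)
{
	struct toy_rw lock = { .readers = 2 };	/* both readers are our own */
	int my_readers = 2;

	printf("with own readers counted: %d\n", try_write_lock(&lock));

	lock.readers -= my_readers;		/* drop our own read counts... */
	printf("after subtracting them:   %d\n", try_write_lock(&lock));
	lock.readers += my_readers;		/* ...and restore them once locked */
	return 0;
}
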
+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ struct btree_path *linked;
+ unsigned i;
+ int ret;
+
+ /*
+ * XXX BIG FAT NOTICE
+ *
+ * Drop all read locks before taking a write lock:
+ *
+ * This is a hack, because bch2_btree_node_lock_write_nofail() is a
+ * hack - but by dropping read locks first, this should never fail, and
+ * we only use this in code paths where whatever read locks we've
+ * already taken are no longer needed:
+ */
+
+ trans_for_each_path(trans, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_read_locked(linked, i)) {
+ btree_node_unlock(trans, linked, i);
+ btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
+ }
+ }
+
+ ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
+}
+
+/* relock */
+
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade)
+{
+ unsigned l = path->level;
+ int fail_idx = -1;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l)))
+ fail_idx = l;
+
+ l++;
+ } while (l < path->locks_want);
+
+ /*
+ * When we fail to get a lock, we have to ensure that any child nodes
+ * can't be relocked, so bch2_btree_path_traverse has to walk back up to
+ * the node that we failed to relock:
+ */
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ --fail_idx;
+ } while (fail_idx >= 0);
+ }
+
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ bch2_trans_verify_locks(trans);
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level,
+ bool trace)
+{
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
+
+ if (race_fault())
+ goto fail;
+
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, want))) {
+ mark_btree_node_locked(trans, path, level, want);
+ return true;
+ }
+fail:
+ if (trace && !trans->notrace_relock_fail)
+ trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+ return false;
+}
+
+/* upgrade */
+
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree *b = path->l[level].b;
+ struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
+
+ if (!is_btree_node(path, level))
+ return false;
+
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ case BTREE_NODE_WRITE_LOCKED:
+ BUG();
+ }
+
+ if (btree_node_intent_locked(path, level))
+ return true;
+
+ if (race_fault())
+ return false;
+
+ if (btree_node_locked(path, level)) {
+ bool ret;
+
+ six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
+ ret = six_lock_tryupgrade(&b->c.lock);
+ six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
+
+ if (ret)
+ goto success;
+ } else {
+ if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+ goto success;
+ }
+
+ /*
+ * Do we already have an intent lock via another path? If so, just bump
+ * lock count:
+ */
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(trans, path, level);
+ goto success;
+ }
+
+ trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+ return false;
+success:
+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
+ return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+ }
+ }
+
+ return 0;
+}
+
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_path_get_locks(trans, path, false);
+}
+
+int __bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+ }
+
+ return 0;
+}
+
+__flatten
+bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_path_get_locks(trans, path, true);
+}
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ EBUG_ON(path->locks_want >= new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ return btree_path_get_locks(trans, path, true);
+}
+
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ struct btree_path *linked;
+
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+ return true;
+
+ /*
+ * XXX: this is ugly - we'd prefer to not be mucking with other
+ * iterators in the btree_trans here.
+ *
+ * On failure to upgrade the iterator, setting iter->locks_want and
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+ * get the locks we want on transaction restart.
+ *
+ * But if this iterator was a clone, on transaction restart what we did
+ * to this iterator isn't going to be preserved.
+ *
+ * Possibly we could add an iterator field for the parent iterator when
+ * an iterator is a copy - for now, we'll just upgrade any other
+ * iterators with the same btree id.
+ *
+ * The code below used to be needed to ensure ancestor nodes get locked
+ * before interior nodes - now that's handled by
+ * bch2_btree_path_traverse_all().
+ */
+ if (!path->cached && !trans->in_traverse_all)
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_path_get_locks(trans, linked, true);
+ }
+
+ return false;
+}
+
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ unsigned l;
+
+ EBUG_ON(path->locks_want < new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ while (path->nodes_locked &&
+ (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(trans, path, l);
+ } else {
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ mark_btree_node_locked_noreset(path, l, SIX_LOCK_read);
+ }
+ break;
+ }
+ }
+
+ bch2_btree_path_verify_locks(path);
+}
+
+/* Btree transaction locking: */
+
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_downgrade(trans, path);
+}
+
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(trans, path);
+
+ /*
+ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
+ * btree nodes; it implements its own walking:
+ */
+ if (!trans->is_initial_gc)
+ bch2_assert_btree_nodes_not_locked();
+}
+
+bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
+int __bch2_trans_mutex_lock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ int ret;
+
+ bch2_trans_unlock(trans);
+ mutex_lock(lock);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ mutex_unlock(lock);
+ return ret;
+}
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+ unsigned l;
+
+ if (!path->nodes_locked) {
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level));
+ return;
+ }
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ int want = btree_lock_want(path, l);
+ int have = btree_node_locked_type(path, l);
+
+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+ BUG_ON(is_btree_node(path, l) &&
+ (want == BTREE_NODE_UNLOCKED ||
+ have != BTREE_NODE_WRITE_LOCKED) &&
+ want != have);
+ }
+}
+
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify_locks(path);
+}
+
+#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index d599008c5fc1..327780ce8e9a 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -14,66 +14,88 @@
#include "btree_iter.h"
+void bch2_btree_lock_init(struct btree_bkey_cached_common *);
+
+#ifdef CONFIG_LOCKDEP
+void bch2_assert_btree_nodes_not_locked(void);
+#else
+static inline void bch2_assert_btree_nodes_not_locked(void) {}
+#endif
+
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
+{
+ return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
+}
+
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{
+ return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+ ? &trans->c->btree_transaction_stats[trans->fn_idx]
+ : NULL;
+}
+
/* matches six lock types */
enum btree_node_locked_type {
BTREE_NODE_UNLOCKED = -1,
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+ BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
};
static inline int btree_node_locked_type(struct btree_path *path,
unsigned level)
{
- /*
- * We're relying on the fact that if nodes_intent_locked is set
- * nodes_locked must be set as well, so that we can compute without
- * branches:
- */
- return BTREE_NODE_UNLOCKED +
- ((path->nodes_locked >> level) & 1) +
- ((path->nodes_intent_locked >> level) & 1);
+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
}
-static inline bool btree_node_intent_locked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
}
-static inline bool btree_node_read_locked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
}
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
{
- return path->nodes_locked & (1 << level);
+ return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
}
-static inline void mark_btree_node_unlocked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
{
- path->nodes_locked &= ~(1 << level);
- path->nodes_intent_locked &= ~(1 << level);
+ return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
}
-static inline void mark_btree_node_locked(struct btree_path *path,
- unsigned level,
- enum six_lock_type type)
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
{
/* relying on this to avoid a branch */
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
- path->nodes_locked |= 1 << level;
- path->nodes_intent_locked |= type << level;
+ path->nodes_locked &= ~(3U << (level << 1));
+ path->nodes_locked |= (type + 1) << (level << 1);
}
-static inline void mark_btree_node_intent_locked(struct btree_path *path,
- unsigned level)
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+ unsigned level)
{
- mark_btree_node_locked(path, level, SIX_LOCK_intent);
+ EBUG_ON(btree_node_write_locked(path, level));
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
+}
+
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ enum six_lock_type type)
+{
+ mark_btree_node_locked_noreset(path, level, type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[level].lock_taken_time = local_clock();
+#endif
}
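/*
 * Illustrative sketch (not part of this patch): path->nodes_locked now packs
 * the lock state two bits per level (0 = unlocked, 1 = read, 2 = intent,
 * 3 = write) into one word, replacing the old pair of one-bit-per-level
 * bitmasks.  The standalone snippet below shows the same encode/decode
 * arithmetic used by mark_btree_node_locked_noreset() and
 * btree_node_locked_type(); pack_lock() and unpack_lock() are made-up names.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

enum lk_type { LK_UNLOCKED = -1, LK_READ = 0, LK_INTENT = 1, LK_WRITE = 2 };

static void pack_lock(uint32_t *nodes_locked, unsigned level, enum lk_type t)
{
	*nodes_locked &= ~(3U << (level << 1));		/* clear the two bits */
	*nodes_locked |= (t + 1) << (level << 1);	/* store 0..3 */
}

static enum lk_type unpack_lock(uint32_t nodes_locked, unsigned level)
{
	return LK_UNLOCKED + ((nodes_locked >> (level << 1)) & 3);
}

int main(void)
{
	uint32_t nodes_locked = 0;

	pack_lock(&nodes_locked, 0, LK_INTENT);
	pack_lock(&nodes_locked, 2, LK_READ);

	assert(unpack_lock(nodes_locked, 0) == LK_INTENT);
	assert(unpack_lock(nodes_locked, 1) == LK_UNLOCKED);
	assert(unpack_lock(nodes_locked, 2) == LK_READ);

	printf("nodes_locked = 0x%x\n", nodes_locked);
	return 0;
}
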
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
@@ -95,56 +117,118 @@ btree_lock_want(struct btree_path *path, int level)
return BTREE_NODE_UNLOCKED;
}
-static inline void btree_node_unlock(struct btree_path *path, unsigned level)
+static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ if (s)
+ __bch2_time_stats_update(&s->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
+#endif
+}
+
+/* unlock: */
+
+static inline void btree_node_unlock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
- if (lock_type != BTREE_NODE_UNLOCKED)
+ if (lock_type != BTREE_NODE_UNLOCKED) {
six_unlock_type(&path->l[level].b->c.lock, lock_type);
+ btree_trans_lock_hold_time_update(trans, path, level);
+ }
mark_btree_node_unlocked(path, level);
}
-static inline void __bch2_btree_path_unlock(struct btree_path *path)
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
{
- btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+ return __ffs(path->nodes_locked) >> 1;
+}
- while (path->nodes_locked)
- btree_node_unlock(path, __ffs(path->nodes_locked));
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{
+ return __fls(path->nodes_locked) >> 1;
}
-static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+ struct btree_path *path)
{
- switch (type) {
- case SIX_LOCK_read:
- return BCH_TIME_btree_lock_contended_read;
- case SIX_LOCK_intent:
- return BCH_TIME_btree_lock_contended_intent;
- case SIX_LOCK_write:
- return BCH_TIME_btree_lock_contended_write;
- default:
- BUG();
- }
+ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+
+ while (path->nodes_locked)
+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
}
/*
- * wrapper around six locks that just traces lock contended time
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
*/
-static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
+{
+ struct btree_path *linked;
+
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
+
+ trans_for_each_path_with_node(trans, b, linked)
+ linked->l[b->c.level].lock_seq += 2;
+
+ six_unlock_write(&b->c.lock);
+}
+
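/*
 * Illustrative sketch (not part of this patch): each btree_path_level saves
 * the node's lock sequence number, and bch2_btree_node_relock() only succeeds
 * if that number is unchanged -- i.e. nobody write-locked the node in the
 * meantime.  A six lock bumps the sequence once on write lock and once on
 * write unlock, which is why the function above adds 2 to the saved seq of
 * every path pointing at this node: those paths stay relockable even though
 * this transaction just held the write lock.  The toy below shows only that
 * arithmetic; struct toy_lock, struct saved and try_relock() are made up.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_lock {
	unsigned seq;		/* bumped on write lock and on write unlock */
};

struct saved {
	unsigned lock_seq;	/* what a path remembered when it last held it */
};

static void write_lock(struct toy_lock *l)   { l->seq++; }	/* seq goes odd */
static void write_unlock(struct toy_lock *l) { l->seq++; }	/* seq even again */

static bool try_relock(struct toy_lock *l, struct saved *s)
{
	return l->seq == s->lock_seq;	/* unchanged => node not modified */
}

int main(void)
{
	struct toy_lock l = { .seq = 0 };
	struct saved s = { .lock_seq = l.seq };

	printf("relock, no writes yet:       %d\n", try_relock(&l, &s));

	write_lock(&l);		/* we write lock and modify the node... */
	write_unlock(&l);
	printf("relock before fixing up seq: %d\n", try_relock(&l, &s));

	s.lock_seq += 2;	/* what unlock_write_inlined does for our paths */
	printf("relock after bumping by two: %d\n", try_relock(&l, &s));
	return 0;
}
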
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
+/* lock: */
+
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail,
+ unsigned long ip)
{
- u64 start_time = local_clock();
+ int ret;
+
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->lock_must_abort = false;
+ trans->locking = b;
+
+ ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
+ WRITE_ONCE(trans->locking, NULL);
+ WRITE_ONCE(trans->locking_wait.start_time, 0);
+ return ret;
+}
- six_lock_type(&b->c.lock, type, NULL, NULL);
- bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ unsigned long ip)
+{
+ return __btree_node_lock_nopath(trans, b, type, false, ip);
}
-static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
{
- if (!six_trylock_type(&b->c.lock, type))
- __btree_node_lock_type(c, b, type);
+ int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
+
+ BUG_ON(ret);
}
/*
@@ -152,92 +236,187 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
* iterators:
*/
static inline bool btree_node_lock_increment(struct btree_trans *trans,
- struct btree *b, unsigned level,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
enum btree_node_locked_type want)
{
struct btree_path *path;
trans_for_each_path(trans, path)
- if (path->l[level].b == b &&
+ if (&path->l[level].b->c == b &&
btree_node_locked_type(path, level) >= want) {
- six_lock_increment(&b->c.lock, want);
+ six_lock_increment(&b->lock, want);
return true;
}
return false;
}
-bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
- struct btree *, struct bpos, unsigned,
- enum six_lock_type,
- six_lock_should_sleep_fn, void *,
- unsigned long);
-
-static inline bool btree_node_lock(struct btree_trans *trans,
+static inline int btree_node_lock(struct btree_trans *trans,
struct btree_path *path,
- struct btree *b, struct bpos pos, unsigned level,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
+ int ret = 0;
+
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
- return likely(six_trylock_type(&b->c.lock, type)) ||
- btree_node_lock_increment(trans, b, level, type) ||
- __bch2_btree_node_lock(trans, path, b, pos, level, type,
- should_sleep_fn, p, ip);
+ if (likely(six_trylock_type(&b->lock, type)) ||
+ btree_node_lock_increment(trans, b, level, type) ||
+ !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[b->level].lock_taken_time = local_clock();
+#endif
+ }
+
+ return ret;
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
+ struct btree_bkey_cached_common *b, bool);
+
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ EBUG_ON(&path->l[b->level].b->c != b);
+ EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
+ EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+ /*
+ * six locks are unfair, and read locks block while a thread wants a
+ * write lock: thus, we need to tell the cycle detector we have a write
+ * lock _before_ taking the lock:
+ */
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
+
+ return likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
+}
+
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ return __btree_node_lock_write(trans, path, b, false);
}
-bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
+void bch2_btree_node_lock_write_nofail(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *);
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *,
+ struct btree_path *, unsigned long);
+int __bch2_btree_path_relock(struct btree_trans *,
+ struct btree_path *, unsigned long);
+
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_node_locked(path, path->level)
+ ? 0
+ : __bch2_btree_path_relock(trans, path, trace_ip);
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
static inline bool bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
EBUG_ON(btree_node_locked(path, level) &&
- btree_node_locked_type(path, level) !=
- __btree_lock_want(path, level));
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
return likely(btree_node_locked(path, level)) ||
- __bch2_btree_node_relock(trans, path, level);
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, true));
}
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
- struct btree *b)
+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree_path *linked;
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, false));
+}
- trans_for_each_path_with_node(trans, b, linked)
- linked->l[b->c.level].lock_seq += 2;
+/* upgrade */
- six_unlock_write(&b->c.lock);
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+ struct btree_path *, unsigned);
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
+
+static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ unsigned old_locks_want = path->locks_want;
+
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+ : path->uptodate == BTREE_ITER_UPTODATE)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+ old_locks_want, new_locks_want);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
}
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
+/* misc: */
+
+static inline void btree_path_set_should_be_locked(struct btree_path *path)
+{
+ EBUG_ON(!btree_node_locked(path, path->level));
+ EBUG_ON(path->uptodate);
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
+ path->should_be_locked = true;
+}
-static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static inline void __btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l)
{
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
- EBUG_ON(!btree_node_intent_locked(path, b->c.level));
+ btree_node_unlock(trans, path, l);
+ path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+}
- if (unlikely(!six_trylock_write(&b->c.lock)))
- __bch2_btree_node_lock_write(trans, b);
+static inline void btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ __btree_path_set_level_up(trans, path, path->level++);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
-#endif /* _BCACHEFS_BTREE_LOCKING_H */
+/* debug */
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *b,
+ unsigned);
+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 65f460e3c567..6250f34fe561 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -6,9 +6,12 @@
#include <linux/rhashtable.h>
#include <linux/six.h>
-#include "bkey_methods.h"
+//#include "bkey_methods.h"
#include "buckets_types.h"
+#include "darray.h"
+#include "errcode.h"
#include "journal_types.h"
+#include "replicas_types.h"
struct open_bucket;
struct btree_update;
@@ -62,6 +65,7 @@ struct btree_bkey_cached_common {
struct six_lock lock;
u8 level;
u8 btree_id;
+ bool cached;
};
struct btree {
@@ -152,11 +156,22 @@ struct btree_cache {
struct mutex lock;
struct list_head live;
struct list_head freeable;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ unsigned freed;
+ unsigned not_freed_lock_intent;
+ unsigned not_freed_lock_write;
+ unsigned not_freed_dirty;
+ unsigned not_freed_read_in_flight;
+ unsigned not_freed_write_in_flight;
+ unsigned not_freed_noevict;
+ unsigned not_freed_write_blocked;
+ unsigned not_freed_will_make_reachable;
+ unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker shrink;
@@ -179,39 +194,33 @@ struct btree_node_iter {
/*
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
-#define BTREE_ITER_SLOTS (1 << 0)
+static const u16 BTREE_ITER_SLOTS = 1 << 0;
+static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1;
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-#define BTREE_ITER_INTENT (1 << 1)
+static const u16 BTREE_ITER_INTENT = 1 << 2;
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-#define BTREE_ITER_PREFETCH (1 << 2)
-/*
- * Indicates that this iterator should not be reused until transaction commit,
- * either because a pending update references it or because the update depends
- * on that particular key being locked (e.g. by the str_hash code, for hash
- * table consistency)
- */
-#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3)
+static const u16 BTREE_ITER_PREFETCH = 1 << 3;
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-#define BTREE_ITER_IS_EXTENTS (1 << 4)
-#define BTREE_ITER_NOT_EXTENTS (1 << 5)
-#define BTREE_ITER_ERROR (1 << 6)
-#define BTREE_ITER_CACHED (1 << 7)
-#define BTREE_ITER_CACHED_NOFILL (1 << 8)
-#define BTREE_ITER_CACHED_NOCREATE (1 << 9)
-#define BTREE_ITER_WITH_UPDATES (1 << 10)
-#define BTREE_ITER_WITH_JOURNAL (1 << 11)
-#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13)
-#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14)
-#define BTREE_ITER_NOPRESERVE (1 << 15)
+static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4;
+static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5;
+static const u16 BTREE_ITER_CACHED = 1 << 6;
+static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7;
+static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8;
+static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9;
+static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
+static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11;
+static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12;
+static const u16 BTREE_ITER_NOPRESERVE = 1 << 13;
+static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14;
+static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -219,14 +228,9 @@ enum btree_path_uptodate {
BTREE_ITER_NEED_TRAVERSE = 2,
};
-#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
-#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
-#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
-#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
-#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
-#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
-#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
-#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8)
+#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
+#define TRACK_PATH_ALLOCATED
+#endif
struct btree_path {
u8 idx;
@@ -237,7 +241,7 @@ struct btree_path {
/* btree_iter_copy starts here: */
struct bpos pos;
- enum btree_id btree_id:4;
+ enum btree_id btree_id:5;
bool cached:1;
bool preserve:1;
enum btree_path_uptodate uptodate:2;
@@ -247,16 +251,18 @@ struct btree_path {
*/
bool should_be_locked:1;
unsigned level:3,
- locks_want:4,
- nodes_locked:4,
- nodes_intent_locked:4;
+ locks_want:3;
+ u8 nodes_locked;
struct btree_path_level {
struct btree *b;
struct btree_node_iter iter;
u32 lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ u64 lock_taken_time;
+#endif
} l[BTREE_MAX_DEPTH];
-#ifdef CONFIG_BCACHEFS_DEBUG
+#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif
};
@@ -266,6 +272,15 @@ static inline struct btree_path_level *path_l(struct btree_path *path)
return path->l + path->level;
}
+static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
+{
+#ifdef TRACK_PATH_ALLOCATED
+ return path->ip_allocated;
+#else
+ return _THIS_IP_;
+#endif
+}
+
/*
* @pos - iterator's current position
* @level - current btree depth
@@ -277,9 +292,11 @@ struct btree_iter {
struct btree_trans *trans;
struct btree_path *path;
struct btree_path *update_path;
+ struct btree_path *key_cache_path;
- enum btree_id btree_id:4;
- unsigned min_depth:4;
+ enum btree_id btree_id:8;
+ unsigned min_depth:3;
+ unsigned advanced:1;
/* btree_iter_copy starts here: */
u16 flags;
@@ -288,26 +305,36 @@ struct btree_iter {
unsigned snapshot;
struct bpos pos;
- struct bpos pos_after_commit;
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
-#ifdef CONFIG_BCACHEFS_DEBUG
+
+ /* BTREE_ITER_WITH_JOURNAL: */
+ size_t journal_idx;
+ struct bpos journal_pos;
+#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif
};
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
+
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
struct shrinker shrink;
unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
- size_t nr_freed;
+ atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
};
@@ -315,7 +342,7 @@ struct btree_key_cache {
struct bkey_cached_key {
u32 btree_id;
struct bpos pos;
-} __attribute__((packed, aligned(4)));
+} __packed __aligned(4);
#define BKEY_CACHED_ACCESSED 0
#define BKEY_CACHED_DIRTY 1
@@ -324,7 +351,7 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
- u8 u64s;
+ u16 u64s;
bool valid;
u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
@@ -334,28 +361,41 @@ struct bkey_cached {
struct journal_preres res;
struct journal_entry_pin journal;
+ u64 seq;
struct bkey_i *k;
};
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+ return !b->cached
+ ? container_of(b, struct btree, c)->key.k.p
+ : container_of(b, struct bkey_cached, c)->key.pos;
+}
+
struct btree_insert_entry {
unsigned flags;
u8 bkey_type;
enum btree_id btree_id:8;
- u8 level;
+ u8 level:4;
bool cached:1;
bool insert_trigger_run:1;
bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
+ /*
+ * @old_k may be a key from the journal; @old_btree_u64s always refers
+ * to the size of the key being overwritten in the btree:
+ */
+ u8 old_btree_u64s;
struct bkey_i *k;
struct btree_path *path;
+ /* key being overwritten: */
+ struct bkey old_k;
+ const struct bch_val *old_v;
unsigned long ip_allocated;
};
-#ifndef CONFIG_LOCKDEP
#define BTREE_ITER_MAX 64
-#else
-#define BTREE_ITER_MAX 32
-#endif
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -365,59 +405,120 @@ struct btree_trans_commit_hook {
struct btree_trans_commit_hook *next;
};
-#define BTREE_TRANS_MEM_MAX (1U << 14)
+#define BTREE_TRANS_MEM_MAX (1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
struct btree_trans {
struct bch_fs *c;
const char *fn;
+ struct closure ref;
struct list_head list;
- struct btree *locking;
- unsigned locking_path_idx;
- struct bpos locking_pos;
- u8 locking_btree_id;
- u8 locking_level;
- pid_t pid;
+ u64 last_begin_time;
+
+ u8 lock_may_not_fail;
+ u8 lock_must_abort;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+
int srcu_idx;
+ u8 fn_idx;
u8 nr_sorted;
u8 nr_updates;
+ u8 nr_wb_updates;
+ u8 wb_updates_size;
bool used_mempool:1;
bool in_traverse_all:1;
- bool restarted:1;
+ bool paths_sorted:1;
+ bool memory_allocation_failure:1;
bool journal_transaction_names:1;
+ bool journal_replay_not_finished:1;
+ bool is_initial_gc:1;
+ bool notrace_relock_fail:1;
+ enum bch_errcode restarted:16;
+ u32 restart_count;
+ unsigned long last_begin_ip;
+ unsigned long last_restarted_ip;
+ unsigned long srcu_lock_time;
+
/*
* For when bch2_trans_update notices we'll be splitting a compressed
* extent:
*/
unsigned extra_journal_res;
+ unsigned nr_max_paths;
u64 paths_allocated;
unsigned mem_top;
+ unsigned mem_max;
unsigned mem_bytes;
void *mem;
- u8 sorted[BTREE_ITER_MAX];
+ u8 sorted[BTREE_ITER_MAX + 8];
struct btree_path *paths;
struct btree_insert_entry *updates;
+ struct btree_write_buffered_key *wb_updates;
/* update path: */
struct btree_trans_commit_hook *hooks;
- struct jset_entry *extra_journal_entries;
- unsigned extra_journal_entry_u64s;
+ darray_u64 extra_journal_entries;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
struct journal_preres journal_preres;
u64 *journal_seq;
struct disk_reservation *disk_res;
- unsigned flags;
unsigned journal_u64s;
unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
};
-#define BTREE_FLAG(flag) \
+#define BCH_BTREE_WRITE_TYPES() \
+ x(initial, 0) \
+ x(init_next_bset, 1) \
+ x(cache_reclaim, 2) \
+ x(journal_reclaim, 3) \
+ x(interior, 4)
+
+enum btree_write_type {
+#define x(t, n) BTREE_WRITE_##t,
+ BCH_BTREE_WRITE_TYPES()
+#undef x
+ BTREE_WRITE_TYPE_NR,
+};
+
+#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
+
+#define BTREE_FLAGS() \
+ x(read_in_flight) \
+ x(read_error) \
+ x(dirty) \
+ x(need_write) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(noevict) \
+ x(write_idx) \
+ x(accessed) \
+ x(write_in_flight) \
+ x(write_in_flight_inner) \
+ x(just_written) \
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
+ x(never_write)
+
+enum btree_flags {
+ /* First bits for btree node write type */
+ BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
+#define x(flag) BTREE_NODE_##flag,
+ BTREE_FLAGS()
+#undef x
+};
+
+#define x(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
@@ -427,36 +528,8 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
static inline void clear_btree_node_ ## flag(struct btree *b) \
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
-enum btree_flags {
- BTREE_NODE_read_in_flight,
- BTREE_NODE_read_error,
- BTREE_NODE_dirty,
- BTREE_NODE_need_write,
- BTREE_NODE_noevict,
- BTREE_NODE_write_idx,
- BTREE_NODE_accessed,
- BTREE_NODE_write_in_flight,
- BTREE_NODE_write_in_flight_inner,
- BTREE_NODE_just_written,
- BTREE_NODE_dying,
- BTREE_NODE_fake,
- BTREE_NODE_need_rewrite,
- BTREE_NODE_never_write,
-};
-
-BTREE_FLAG(read_in_flight);
-BTREE_FLAG(read_error);
-BTREE_FLAG(need_write);
-BTREE_FLAG(noevict);
-BTREE_FLAG(write_idx);
-BTREE_FLAG(accessed);
-BTREE_FLAG(write_in_flight);
-BTREE_FLAG(write_in_flight_inner);
-BTREE_FLAG(just_written);
-BTREE_FLAG(dying);
-BTREE_FLAG(fake);
-BTREE_FLAG(need_rewrite);
-BTREE_FLAG(never_write);
+BTREE_FLAGS()
+#undef x
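/*
 * Illustrative sketch (not part of this patch): BTREE_FLAGS() above is an
 * "X macro" -- one list of names expanded twice, once to build enum
 * btree_flags and once (just above) to generate the test/set/clear helpers,
 * so adding a flag in a single place generates everything else.  The
 * standalone program below uses the same trick on a made-up flag set;
 * MY_FLAGS(), FLAG_* and the is_ and set_ helpers are all hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define MY_FLAGS()		\
	x(dirty)		\
	x(need_write)		\
	x(accessed)

enum my_flags {
#define x(flag)	FLAG_##flag,
	MY_FLAGS()
#undef x
	FLAG_NR,
};

#define x(flag)							\
static inline bool is_##flag(unsigned long flags)		\
{ return flags & (1UL << FLAG_##flag); }			\
static inline void set_##flag(unsigned long *flags)		\
{ *flags |= 1UL << FLAG_##flag; }

MY_FLAGS()
#undef x

int main(void)
{
	unsigned long flags = 0;

	set_dirty(&flags);
	set_accessed(&flags);

	printf("dirty=%d need_write=%d accessed=%d (%d flags total)\n",
	       is_dirty(flags), is_need_write(flags), is_accessed(flags),
	       FLAG_NR);
	return 0;
}
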
static inline struct btree_write *btree_current_write(struct btree *b)
{
@@ -586,24 +659,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
return __btree_node_type(b->c.level, b->c.btree_id);
}
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
- switch (type) {
- case BKEY_TYPE_extents:
- case BKEY_TYPE_reflink:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool btree_node_is_extents(struct btree *b)
-{
- return btree_node_type_is_extents(btree_node_type(b));
-}
-
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_extents)| \
+ (1U << BKEY_TYPE_alloc)| \
(1U << BKEY_TYPE_inodes)| \
(1U << BKEY_TYPE_stripes)| \
(1U << BKEY_TYPE_reflink)| \
@@ -619,6 +677,16 @@ static inline bool btree_node_is_extents(struct btree *b)
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+#define BTREE_ID_IS_EXTENTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_reflink)| \
+ (1U << BTREE_ID_freespace))
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+ return (1U << type) & BTREE_ID_IS_EXTENTS;
+}
+
#define BTREE_ID_HAS_SNAPSHOTS \
((1U << BTREE_ID_extents)| \
(1U << BTREE_ID_inodes)| \
@@ -634,38 +702,10 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
}
-enum btree_update_flags {
- __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
-
- __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
-
- __BTREE_TRIGGER_INSERT,
- __BTREE_TRIGGER_OVERWRITE,
-
- __BTREE_TRIGGER_GC,
- __BTREE_TRIGGER_BUCKET_INVALIDATE,
- __BTREE_TRIGGER_NOATOMIC,
-};
-
-#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
-
-#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
-
-#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
-#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
-#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
- ((1U << KEY_TYPE_alloc)| \
- (1U << KEY_TYPE_alloc_v2)| \
- (1U << KEY_TYPE_alloc_v3)| \
- (1U << KEY_TYPE_stripe)| \
- (1U << KEY_TYPE_inode)| \
- (1U << KEY_TYPE_inode_v2)| \
- (1U << KEY_TYPE_snapshot))
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+ return (1 << id) & BTREE_ID_HAS_PTRS;
+}
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@@ -682,15 +722,6 @@ struct btree_root {
s8 error;
};
-enum btree_insert_ret {
- BTREE_INSERT_OK,
- /* leaf node needs to be split */
- BTREE_INSERT_BTREE_NODE_FULL,
- BTREE_INSERT_NEED_MARK_REPLICAS,
- BTREE_INSERT_NEED_JOURNAL_RES,
- BTREE_INSERT_NEED_JOURNAL_RECLAIM,
-};
-
enum btree_gc_coalesce_fail_reason {
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 5e5a1b5e750e..4adb6f646655 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -8,20 +8,26 @@
struct bch_fs;
struct btree;
-void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
- struct btree *);
+void bch2_btree_node_prep_for_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
struct btree *, struct btree_node_iter *,
struct bkey_i *);
+
+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
+void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, u64);
+
enum btree_insert_flags {
- __BTREE_INSERT_NOFAIL,
+ /* First two bits for journal watermark: */
+ __BTREE_INSERT_NOFAIL = 2,
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_LAZY_RW,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_JOURNAL_RECLAIM,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
@@ -41,9 +47,6 @@ enum btree_insert_flags {
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
-/* Indicates that we have pre-reserved space in the journal: */
-#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
-
/* Insert is being called from journal reclaim path: */
#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM)
@@ -54,9 +57,16 @@ enum btree_insert_flags {
#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
+int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
+ unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
-int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
+int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
+ struct bkey_i *, enum btree_update_flags);
+
+int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *,
+ enum btree_update_flags);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
@@ -76,14 +86,17 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *,
int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
- struct bkey_i *, enum btree_update_flags);
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_buffered(struct btree_trans *,
+ enum btree_id, struct bkey_i *);
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
-int __bch2_trans_commit(struct btree_trans *);
+int __bch2_trans_commit(struct btree_trans *, unsigned);
+
+int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
/**
* bch2_trans_commit - insert keys at given iterator positions
@@ -91,7 +104,6 @@ int __bch2_trans_commit(struct btree_trans *);
* This is main entry point for btree updates.
*
* Return values:
- * -EINTR: locking changed, this function should be called again.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
@@ -102,35 +114,37 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
{
trans->disk_res = disk_res;
trans->journal_seq = journal_seq;
- trans->flags = flags;
- return __bch2_trans_commit(trans);
+ return __bch2_trans_commit(trans, flags);
}
-#define lockrestart_do(_trans, _do) \
+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
({ \
+ struct btree_trans trans; \
int _ret; \
\
- do { \
- bch2_trans_begin(_trans); \
- _ret = (_do); \
- } while (_ret == -EINTR); \
+ bch2_trans_init(&trans, (_c), 0, 0); \
+ _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \
+ bch2_trans_exit(&trans); \
\
_ret; \
})
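/*
 * Illustrative sketch (not part of this patch): commit_do()/bch2_trans_do()
 * wrap the "redo the whole operation when the transaction restarts" pattern.
 * The standalone snippet below shows the shape of that loop using a GNU
 * statement expression, as the macros above do; struct txn, txn_begin(),
 * do_op() and TXN_RESTART are made up and are not bcachefs APIs.
 */
#include <stdio.h>

#define TXN_RESTART	(-1)

struct txn {
	int attempts;
};

static void txn_begin(struct txn *t)
{
	/* stands in for resetting per-attempt transaction state */
	(void) t;
}

/* Pretend operation: asks for a restart on the first two attempts. */
static int do_op(struct txn *t)
{
	return ++t->attempts < 3 ? TXN_RESTART : 0;
}

#define txn_do(_t, _op)						\
({								\
	int _ret;						\
								\
	do {							\
		txn_begin(_t);					\
		_ret = (_op);					\
	} while (_ret == TXN_RESTART);				\
	_ret;							\
})

int main(void)
{
	struct txn t = { 0 };
	int ret = txn_do(&t, do_op(&t));

	printf("ret=%d after %d attempts\n", ret, t.attempts);
	return 0;
}
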
-#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \
- lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
- (_journal_seq), (_flags)))
-
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+#define bch2_trans_run(_c, _do) \
({ \
struct btree_trans trans; \
int _ret; \
\
bch2_trans_init(&trans, (_c), 0, 0); \
- _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
- _do); \
+ _ret = (_do); \
bch2_trans_exit(&trans); \
\
_ret; \
@@ -141,4 +155,32 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
+#define trans_for_each_wb_update(_trans, _i) \
+ for ((_i) = (_trans)->wb_updates; \
+ (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
+ (_i)++)
+
+static inline void bch2_trans_reset_updates(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ bch2_path_put(trans, i->path, true);
+
+ trans->extra_journal_res = 0;
+ trans->nr_updates = 0;
+ trans->nr_wb_updates = 0;
+ trans->wb_updates = NULL;
+ trans->hooks = NULL;
+ trans->extra_journal_entries.nr = 0;
+
+ if (trans->fs_usage_deltas) {
+ trans->fs_usage_deltas->used = 0;
+ memset((void *) trans->fs_usage_deltas +
+ offsetof(struct replicas_delta_list, memset_start), 0,
+ (void *) &trans->fs_usage_deltas->memset_end -
+ (void *) &trans->fs_usage_deltas->memset_start);
+ }
+}
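/*
 * Illustrative sketch (not part of this patch): the memset() in
 * bch2_trans_reset_updates() clears only the members lying between the
 * memset_start/memset_end markers of struct replicas_delta_list, rather than
 * zeroing each field by hand.  The standalone program below shows the same
 * idiom on a made-up struct (zero-length marker arrays are a GNU/kernel-style
 * extension); struct counters and counters_reset() are hypothetical names.
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct counters {
	int	id;			/* preserved across a reset */
	char	memset_start[0];	/* everything from here... */
	int	used;
	int	hits;
	int	misses;
	char	memset_end[0];		/* ...to here gets cleared */
};

static void counters_reset(struct counters *c)
{
	memset(c->memset_start, 0,
	       offsetof(struct counters, memset_end) -
	       offsetof(struct counters, memset_start));
}

int main(void)
{
	struct counters c = { .id = 42, .used = 7, .hits = 3, .misses = 1 };

	counters_reset(&c);
	printf("id=%d used=%d hits=%d misses=%d\n",
	       c.id, c.used, c.hits, c.misses);
	return 0;
}
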
+
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 7b8ca1153efe..e42e852199f5 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -11,6 +11,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
+#include "clock.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
@@ -23,11 +24,27 @@
#include <linux/random.h>
#include <trace/events/bcachefs.h>
-static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- struct btree_path *, struct btree *,
- struct keylist *, unsigned);
+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+ struct btree_path *, struct btree *,
+ struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
+static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ struct btree_path *path;
+
+ path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_NOPRESERVE|
+ BTREE_ITER_INTENT, _RET_IP_);
+ path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path;
+}
+
/* Debug code: */
/*
@@ -41,7 +58,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
struct bkey_s_c k;
struct bkey_s_c_btree_ptr_v2 bp;
struct bkey unpacked;
- char buf1[100], buf2[100];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
BUG_ON(!b->c.level);
@@ -56,21 +73,21 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
break;
bp = bkey_s_c_to_btree_ptr_v2(k);
- if (bpos_cmp(next_node, bp.v->min_key)) {
+ if (!bpos_eq(next_node, bp.v->min_key)) {
bch2_dump_btree_node(c, b);
- panic("expected next min_key %s got %s\n",
- (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
+ bch2_bpos_to_text(&buf1, next_node);
+ bch2_bpos_to_text(&buf2, bp.v->min_key);
+ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
}
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
- if (bpos_cmp(k.k->p, b->key.k.p)) {
+ if (!bpos_eq(k.k->p, b->key.k.p)) {
bch2_dump_btree_node(c, b);
- panic("expected end %s got %s\n",
- (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
- (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
+ bch2_bpos_to_text(&buf1, b->key.k.p);
+ bch2_bpos_to_text(&buf2, k.k->p);
+ panic("expected end %s got %s\n", buf1.buf, buf2.buf);
}
break;
}
@@ -143,8 +160,9 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
- trace_btree_node_free(c, b);
+ trace_and_count(c, btree_node_free, c, b);
+ BUG_ON(btree_node_write_blocked(b));
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
BUG_ON(b == btree_node_root(c, b));
@@ -160,43 +178,84 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
}
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree_path *path,
struct btree *b)
{
struct bch_fs *c = trans->c;
- struct btree_path *path;
+ unsigned level = b->c.level;
+
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
trans_for_each_path(trans, path)
- BUG_ON(path->l[b->c.level].b == b &&
- path->l[b->c.level].lock_seq == b->c.lock.state.seq);
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
+static void bch2_btree_node_free_never_used(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
+ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
+ struct btree_path *path;
+ unsigned level = b->c.level;
- six_lock_write(&b->c.lock, NULL, NULL);
+ BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
+
+ b->will_make_reachable = 0;
+ closure_put(&as->cl);
+ clear_btree_node_will_make_reachable(b);
+ clear_btree_node_accessed(b);
+ clear_btree_node_dirty_acct(c, b);
+ clear_btree_node_need_write(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_del_init(&b->list);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- __btree_node_free(c, b);
+ mutex_unlock(&c->btree_cache.lock);
+
+ BUG_ON(p->nr >= ARRAY_SIZE(p->b));
+ p->b[p->nr++] = b;
- six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
+
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
}
-static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
+ bool interior_node,
unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct write_point *wp;
struct btree *b;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct open_buckets ob = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
+ int ret;
if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = 0;
- alloc_reserve = RESERVE_BTREE_MOVINGGC;
+ alloc_reserve = RESERVE_btree_movinggc;
} else {
nr_reserve = BTREE_NODE_RESERVE;
- alloc_reserve = RESERVE_BTREE;
+ alloc_reserve = RESERVE_btree;
}
mutex_lock(&c->btree_reserve_cache_lock);
@@ -212,7 +271,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- wp = bch2_alloc_sectors_start(c,
+ ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
@@ -220,9 +279,9 @@ retry:
&devs_have,
res->nr_replicas,
c->opts.metadata_replicas_required,
- alloc_reserve, 0, cl);
- if (IS_ERR(wp))
- return ERR_CAST(wp);
+ alloc_reserve, 0, cl, &wp);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
@@ -242,7 +301,9 @@ retry:
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(trans, interior_node);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
@@ -254,19 +315,25 @@ mem_alloc:
return b;
}
-static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+ struct btree_trans *trans,
+ unsigned level)
{
struct bch_fs *c = as->c;
struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!as->nr_prealloc_nodes);
+ BUG_ON(!p->nr);
- b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ b = p->b[--p->nr];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
set_btree_node_accessed(b);
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
@@ -296,7 +363,8 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
- trace_btree_node_alloc(c, b);
+ trace_and_count(c, btree_node_alloc, c, b);
+ bch2_increment_clock(c, btree_sectors(c), WRITE);
return b;
}
@@ -313,13 +381,19 @@ static void btree_set_max(struct btree *b, struct bpos pos)
b->data->max_key = pos;
}
-struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
- struct btree *b,
- struct bkey_format format)
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
{
- struct btree *n;
+ struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
+ struct bkey_format format = bch2_btree_calc_format(b);
- n = bch2_btree_node_alloc(as, b->c.level);
+ /*
+ * The keys might expand with the new format - if they wouldn't fit in
+ * the btree node anymore, use the old format for now:
+ */
+ if (!bch2_btree_node_format_fits(as->c, b, &format))
+ format = b->format;
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
@@ -332,29 +406,13 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
bch2_btree_sort_into(as->c, n, b);
btree_node_reset_sib_u64s(n);
-
- n->key.k.p = b->key.k.p;
return n;
}
-static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
- struct btree *b)
-{
- struct bkey_format new_f = bch2_btree_calc_format(b);
-
- /*
- * The keys might expand with the new format - if they wouldn't fit in
- * the btree node anymore, use the old format for now:
- */
- if (!bch2_btree_node_format_fits(as->c, b, &new_f))
- new_f = b->format;
-
- return __bch2_btree_node_alloc_replacement(as, b, new_f);
-}
-
-static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
+static struct btree *__btree_root_alloc(struct btree_update *as,
+ struct btree_trans *trans, unsigned level)
{
- struct btree *b = bch2_btree_node_alloc(as, level);
+ struct btree *b = bch2_btree_node_alloc(as, trans, level);
btree_set_min(b, POS_MIN);
btree_set_max(b, SPOS_MAX);
@@ -363,85 +421,92 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
btree_node_set_format(b, b->data->format);
bch2_btree_build_aux_trees(b);
- bch2_btree_update_add_new_node(as, b);
- six_unlock_write(&b->c.lock);
-
return b;
}
-static void bch2_btree_reserve_put(struct btree_update *as)
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
- mutex_lock(&c->btree_reserve_cache_lock);
-
- while (as->nr_prealloc_nodes) {
- struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
- six_unlock_write(&b->c.lock);
+ mutex_lock(&c->btree_reserve_cache_lock);
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
- btree_node_lock_type(c, b, SIX_LOCK_write);
- __btree_node_free(c, b);
- six_unlock_write(&b->c.lock);
+ mutex_unlock(&c->btree_reserve_cache_lock);
- six_unlock_intent(&b->c.lock);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ }
}
-
- mutex_unlock(&c->btree_reserve_cache_lock);
}
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
- unsigned flags, struct closure *cl)
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+ struct btree_update *as,
+ unsigned nr_nodes[2],
+ unsigned flags,
+ struct closure *cl)
{
struct bch_fs *c = as->c;
struct btree *b;
- int ret;
+ unsigned interior;
+ int ret = 0;
- BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
+ *
+ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+ * blocking on this lock:
*/
ret = bch2_btree_cache_cannibalize_lock(c, cl);
if (ret)
return ret;
- while (as->nr_prealloc_nodes < nr_nodes) {
- b = __bch2_btree_node_alloc(c, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT
- ? NULL : cl, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err_free;
- }
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
- as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
- }
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(trans, &as->disk_res,
+ flags & BTREE_INSERT_NOWAIT ? NULL : cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
+ p->b[p->nr++] = b;
+ }
+ }
+err:
bch2_btree_cache_cannibalize_unlock(c);
- return 0;
-err_free:
- bch2_btree_cache_cannibalize_unlock(c);
- trace_btree_reserve_get_fail(c, nr_nodes, cl);
return ret;
}
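bch2_btree_reserve_get() above now fills two preallocation pools, one for leaf nodes and one for interior nodes, and funnels any allocation failure through a single err label so the cannibalize lock is always released. A rough standalone sketch of that fill-two-pools-then-unwind shape, with made-up allocator and pool types:

#include <stdio.h>
#include <stdlib.h>

#define POOL_MAX 8

struct node_pool {
	void		*b[POOL_MAX];
	unsigned	 nr;
};

/* Pretend allocator that succeeds until a fake budget runs out: */
static void *fake_alloc(unsigned *budget)
{
	if (!*budget)
		return NULL;
	(*budget)--;
	return malloc(64);
}

static int reserve_get(struct node_pool p[2], const unsigned want[2],
		       unsigned *budget)
{
	int ret = 0;

	for (unsigned interior = 0; interior < 2; interior++) {
		while (p[interior].nr < want[interior]) {
			void *b = fake_alloc(budget);

			if (!b) {
				ret = -1;	/* caller keeps what was allocated */
				goto err;
			}
			p[interior].b[p[interior].nr++] = b;
		}
	}
err:
	return ret;
}

int main(void)
{
	struct node_pool p[2] = {{ .nr = 0 }, { .nr = 0 }};
	unsigned want[2] = { 3, 2 }, budget = 4;

	int ret = reserve_get(p, want, &budget);
	printf("ret %d, leaf pool %u, interior pool %u\n", ret, p[0].nr, p[1].nr);

	for (unsigned i = 0; i < 2; i++)
		while (p[i].nr)
			free(p[i].b[--p[i].nr]);
	return 0;
}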
/* Asynchronous interior node update machinery */
-static void bch2_btree_update_free(struct btree_update *as)
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
@@ -454,7 +519,7 @@ static void bch2_btree_update_free(struct btree_update *as)
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
- bch2_btree_reserve_put(as);
+ bch2_btree_reserve_put(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
as->start_time);
@@ -475,20 +540,18 @@ static void bch2_btree_update_free(struct btree_update *as)
mutex_unlock(&c->btree_interior_update_lock);
}
-static void btree_update_will_delete_key(struct btree_update *as,
- struct bkey_i *k)
+static void btree_update_add_key(struct btree_update *as,
+ struct keylist *keys, struct btree *b)
{
- BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
+ struct bkey_i *k = &b->key;
+
+ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
ARRAY_SIZE(as->_old_keys));
- bch2_keylist_add(&as->old_keys, k);
-}
-static void btree_update_will_add_key(struct btree_update *as,
- struct bkey_i *k)
-{
- BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
- ARRAY_SIZE(as->_new_keys));
- bch2_keylist_add(&as->new_keys, k);
+ bkey_copy(keys->top, k);
+ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+ bch2_keylist_push(keys);
}
/*
@@ -501,24 +564,29 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct bkey_i *k;
int ret;
- trans->extra_journal_entries = (void *) &as->journal_entries[0];
- trans->extra_journal_entry_u64s = as->journal_u64s;
+ ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+ if (ret)
+ return ret;
+
+ memcpy(&darray_top(trans->extra_journal_entries),
+ as->journal_entries,
+ as->journal_u64s * sizeof(u64));
+ trans->extra_journal_entries.nr += as->journal_u64s;
+
trans->journal_pin = &as->journal;
- for_each_keylist_key(&as->new_keys, k) {
- ret = bch2_trans_mark_key(trans,
- bkey_s_c_null,
- bkey_i_to_s_c(k),
- BTREE_TRIGGER_INSERT);
+ for_each_keylist_key(&as->old_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
if (ret)
return ret;
}
- for_each_keylist_key(&as->old_keys, k) {
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(k),
- bkey_s_c_null,
- BTREE_TRIGGER_OVERWRITE);
+ for_each_keylist_key(&as->new_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
if (ret)
return ret;
}
@@ -529,12 +597,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
- struct btree *b = as->b;
+ struct btree *b;
struct btree_trans trans;
u64 journal_seq = 0;
unsigned i;
int ret;
+ bch2_trans_init(&trans, c, 0, 512);
/*
* If we're already in an error state, it might be because a btree node
* was never written, and we might be trying to free that same btree
@@ -546,22 +615,21 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
- BUG_ON(!journal_pin_active(&as->journal));
-
/*
* Wait for any in flight writes to finish before we free the old nodes
* on disk:
*/
for (i = 0; i < as->nr_old_nodes; i++) {
- struct btree *old = as->old_nodes[i];
__le64 seq;
- six_lock_read(&old->c.lock, NULL, NULL);
- seq = old->data ? old->data->keys.seq : 0;
- six_unlock_read(&old->c.lock);
+ b = as->old_nodes[i];
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ seq = b->data ? b->data->keys.seq : 0;
+ six_unlock_read(&b->c.lock);
if (seq == as->old_nodes_seq[i])
- wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
@@ -578,19 +646,23 @@ static void btree_update_nodes_written(struct btree_update *as)
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
- bch2_trans_init(&trans, c, 0, 512);
- ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED,
- btree_update_nodes_written_trans(&trans, as));
- bch2_trans_exit(&trans);
+ ret = commit_do(&trans, &as->disk_res, &journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ JOURNAL_WATERMARK_reserved,
+ btree_update_nodes_written_trans(&trans, as));
+ bch2_trans_unlock(&trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
- "error %i in btree_update_nodes_written()", ret);
+ "%s(): error %s", __func__, bch2_err_str(ret));
err:
- if (b) {
+ if (as->b) {
+ struct btree_path *path;
+
+ b = as->b;
+ path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p);
/*
* @b is the node we did the final insert into:
*
@@ -603,11 +675,29 @@ err:
* we're in journal error state:
*/
- btree_node_lock_type(c, b, SIX_LOCK_intent);
- btree_node_lock_type(c, b, SIX_LOCK_write);
+ /*
+ * Ensure transaction is unlocked before using
+ * btree_node_lock_nopath() (the use of which is always suspect,
+ * we need to work on removing this in the future)
+ *
+ * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so
+ * we may rarely end up with a locked path besides the one we
+ * have here:
+ */
+ bch2_trans_unlock(&trans);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
+ mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
+ path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+ path->l[b->c.level].b = b;
+
+ bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
+
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->write_blocked_list);
+ if (list_empty(&b->write_blocked))
+ clear_btree_node_write_blocked(b);
/*
* Node might have been freed, recheck under
@@ -621,8 +711,8 @@ err:
if (!ret) {
i->journal_seq = cpu_to_le64(
- max(journal_seq,
- le64_to_cpu(i->journal_seq)));
+ max(journal_seq,
+ le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, journal_seq);
} else {
@@ -636,10 +726,13 @@ err:
}
mutex_unlock(&c->btree_interior_update_lock);
+
+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
six_unlock_write(&b->c.lock);
btree_node_write_if_need(c, b, SIX_LOCK_intent);
- six_unlock_intent(&b->c.lock);
+ btree_node_unlock(&trans, path, b->c.level);
+ bch2_path_put(&trans, path, true);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -652,13 +745,14 @@ err:
BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
+ clear_btree_node_will_make_reachable(b);
}
mutex_unlock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
- btree_node_lock_type(c, b, SIX_LOCK_read);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
@@ -666,7 +760,8 @@ err:
for (i = 0; i < as->nr_open_buckets; i++)
bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
- bch2_btree_update_free(as);
+ bch2_btree_update_free(as, &trans);
+ bch2_trans_exit(&trans);
}
static void btree_interior_update_work(struct work_struct *work)
@@ -715,9 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
BUG_ON(!btree_node_dirty(b));
+ BUG_ON(!b->c.level);
as->mode = BTREE_INTERIOR_UPDATING_NODE;
as->b = b;
+
+ set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
@@ -783,10 +881,19 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree
as->new_nodes[as->nr_new_nodes++] = b;
b->will_make_reachable = 1UL|(unsigned long) as;
+ set_btree_node_will_make_reachable(b);
mutex_unlock(&c->btree_interior_update_lock);
- btree_update_will_add_key(as, &b->key);
+ btree_update_add_key(as, &as->new_keys, b);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
+ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
+
+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+ cpu_to_le16(sectors);
+ }
}
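bch2_btree_update_add_new_node() above records sectors_written for btree_ptr_v2 keys by rounding the bytes currently used in the node up to a whole block and converting to 512-byte sectors. A small worked example of that arithmetic, with an assumed 4096-byte block size:

#include <stdio.h>

/* Round n up to a multiple of size (size must be a power of two): */
#define round_up(n, size)	(((n) + (size) - 1) & ~((size) - 1))

int main(void)
{
	unsigned block_bytes = 4096;	/* assumed filesystem block size */
	unsigned bytes	     = 4500;	/* bytes of key data in the node */
	unsigned sectors     = round_up(bytes, block_bytes) >> 9;

	/* 4500 bytes rounds up to 8192, i.e. 16 512-byte sectors: */
	printf("sectors_written = %u\n", sectors);
	return 0;
}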
/*
@@ -805,6 +912,7 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
* xchg() is for synchronization with bch2_btree_complete_write:
*/
v = xchg(&b->will_make_reachable, 0);
+ clear_btree_node_will_make_reachable(b);
as = (struct btree_update *) (v & ~1UL);
if (!as) {
@@ -838,7 +946,7 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
* btree_updates to point to this btree_update:
*/
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
- struct btree *b)
+ struct btree *b)
{
struct bch_fs *c = as->c;
struct btree_update *p, *n;
@@ -870,8 +978,9 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
closure_wake_up(&c->btree_interior_update_wait);
}
- clear_btree_node_dirty(c, b);
+ clear_btree_node_dirty_acct(c, b);
clear_btree_node_need_write(b);
+ clear_btree_node_write_blocked(b);
/*
* Does this node have unwritten data that has a pin on the journal?
@@ -902,14 +1011,14 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
*/
btree_update_drop_new_node(c, b);
- btree_update_will_delete_key(as, &b->key);
+ btree_update_add_key(as, &as->old_keys, b);
as->old_nodes[as->nr_old_nodes] = b;
as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
as->nr_old_nodes++;
}
-static void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
@@ -920,7 +1029,7 @@ static void bch2_btree_update_done(struct btree_update *as)
up_read(&as->c->gc_lock);
as->took_gc_lock = false;
- bch2_btree_reserve_put(as);
+ bch2_btree_reserve_put(as, trans);
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);
@@ -931,34 +1040,44 @@ static void bch2_btree_update_done(struct btree_update *as)
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, unsigned nr_nodes, unsigned flags)
+ unsigned level, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
- struct closure cl;
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
- int journal_flags = 0;
+ unsigned nr_nodes[2] = { 0, 0 };
+ unsigned update_level = level;
+ int journal_flags = flags & JOURNAL_WATERMARK_MASK;
int ret = 0;
+ u32 restart_count = trans->restart_count;
BUG_ON(!path->should_be_locked);
- if (flags & BTREE_INSERT_JOURNAL_RESERVED)
- journal_flags |= JOURNAL_RES_GET_RESERVED;
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+ journal_flags |= JOURNAL_RES_GET_NONBLOCK;
- closure_init_stack(&cl);
-retry:
+ while (1) {
+ nr_nodes[!!update_level] += 1 + split;
+ update_level++;
- /*
- * XXX: figure out how far we might need to split,
- * instead of locking/reserving all the way to the root:
- */
- if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
- trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
- path->btree_id, &path->pos);
- ret = btree_trans_restart(trans);
- return ERR_PTR(ret);
+ ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!btree_path_node(path, update_level)) {
+ /* Allocating new root? */
+ nr_nodes[1] += split;
+ update_level = BTREE_MAX_DEPTH;
+ break;
+ }
+
+ if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ BKEY_BTREE_PTR_U64s_MAX * (1 + split)))
+ break;
+
+ split = true;
}
if (flags & BTREE_INSERT_GC_LOCK_HELD)
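The loop added to bch2_btree_update_start() above replaces the old reserve-all-the-way-to-the-root behaviour: it walks up one level at a time, reserving 1 + split nodes per level, and stops as soon as a parent has room for the keys being inserted, reserving one extra interior node if it reaches the root. A standalone sketch of that walk, with a fabricated node_fits() standing in for bch2_btree_node_insert_fits():

#include <stdio.h>
#include <stdbool.h>

#define MAX_DEPTH 4

/* Fabricated stand-in for the real fits check: pretend that nodes at
 * level 2 and above have room for the keys we're inserting. */
static bool node_fits(unsigned level)
{
	return level >= 2;
}

int main(void)
{
	unsigned nr_nodes[2] = { 0, 0 };	/* [0] = leaf nodes, [1] = interior */
	unsigned level = 0;			/* level the update starts at */
	bool split = true;			/* the leaf itself needs to split */

	while (1) {
		/* Reserve enough nodes to replace (and possibly split) this level: */
		nr_nodes[!!level] += 1 + split;
		level++;

		if (level >= MAX_DEPTH) {
			/* Ran out of parents: may also need a new root */
			nr_nodes[1] += split;
			break;
		}

		if (node_fits(level))
			break;

		/* Parent doesn't have room either, so it will have to split too: */
		split = true;
	}

	printf("reserve %u leaf + %u interior nodes\n", nr_nodes[0], nr_nodes[1]);
	return 0;
}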
@@ -966,9 +1085,10 @@ retry:
else if (!down_read_trylock(&c->gc_lock)) {
bch2_trans_unlock(trans);
down_read(&c->gc_lock);
- if (!bch2_trans_relock(trans)) {
+ ret = bch2_trans_relock(trans);
+ if (ret) {
up_read(&c->gc_lock);
- return ERR_PTR(-EINTR);
+ return ERR_PTR(ret);
}
}
@@ -980,6 +1100,7 @@ retry:
as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
as->btree_id = path->btree_id;
+ as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1006,57 +1127,64 @@ retry:
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret == -EAGAIN) {
+ if (ret) {
bch2_trans_unlock(trans);
if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- bch2_btree_update_free(as);
- btree_trans_restart(trans);
- return ERR_PTR(ret);
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ goto err;
}
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags);
+ BTREE_UPDATE_JOURNAL_RES,
+ journal_flags);
if (ret) {
- trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
+ trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
goto err;
}
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
goto err;
- }
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
- nr_nodes * btree_sectors(c),
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
disk_res_flags);
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
- if (ret)
- goto err;
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ if (bch2_err_matches(ret, ENOSPC) ||
+ bch2_err_matches(ret, ENOMEM)) {
+ struct closure cl;
- bch2_journal_pin_add(&c->journal,
- atomic64_read(&c->journal.seq),
- &as->journal, NULL);
+ closure_init_stack(&cl);
- return as;
-err:
- bch2_btree_update_free(as);
+ do {
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
- if (ret == -EAGAIN) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- ret = -EINTR;
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
}
- if (ret == -EINTR && bch2_trans_relock(trans))
- goto retry;
+ if (ret) {
+ trace_and_count(c, btree_reserve_get_fail, trans->fn,
+ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
+ goto err;
+ }
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+ bch2_trans_verify_not_restarted(trans, restart_count);
+ return as;
+err:
+ bch2_btree_update_free(as, trans);
return ERR_PTR(ret);
}
@@ -1069,11 +1197,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
list_del_init(&b->list);
mutex_unlock(&c->btree_cache.lock);
- if (b->c.level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free(&b->c.lock);
-
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
@@ -1105,9 +1228,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct bch_fs *c = as->c;
struct btree *old;
- trace_btree_set_root(c, b);
- BUG_ON(!b->written &&
- !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
+ trace_and_count(c, btree_node_set_root, c, b);
old = btree_node_root(c, b);
@@ -1115,7 +1236,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write(trans, path, old);
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
bch2_btree_set_root_inmem(c, b);
@@ -1142,7 +1263,8 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
{
struct bch_fs *c = as->c;
struct bkey_packed *k;
- const char *invalid;
+ struct printbuf buf = PRINTBUF;
+ unsigned long old, new, v;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
@@ -1150,13 +1272,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
- bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
- if (invalid) {
- char buf[160];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
- bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf) ?:
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "inserting invalid bkey\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_printf(&buf, "\n ");
+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf);
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
dump_stack();
}
@@ -1174,8 +1301,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_node_iter_advance(node_iter, b);
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
- set_btree_node_dirty(c, b);
- set_btree_node_need_write(b);
+ set_btree_node_dirty_acct(c, b);
+
+ v = READ_ONCE(b->flags);
+ do {
+ old = new = v;
+
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_interior;
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ printbuf_exit(&buf);
}
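bch2_insert_fixup_btree_ptr() above updates b->flags with a cmpxchg loop so the write type and the need_write bit change atomically with respect to concurrent flag updates. The same read-modify-cmpxchg pattern in portable C11 atomics, with made-up bit definitions:

#include <stdatomic.h>
#include <stdio.h>

#define WRITE_TYPE_MASK	0x3UL		/* assumed: low two bits hold the write type */
#define WRITE_interior	0x2UL
#define NEED_WRITE_BIT	(1UL << 2)

/* Atomically rewrite a flags word: clear the write-type field, then set
 * the interior write type and the need_write bit, retrying if another
 * thread changed the flags in the meantime: */
static void set_interior_need_write(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags), new;

	do {
		new = old;
		new &= ~WRITE_TYPE_MASK;
		new |= WRITE_interior;
		new |= NEED_WRITE_BIT;
	} while (!atomic_compare_exchange_weak(flags, &old, new));
}

int main(void)
{
	_Atomic unsigned long flags = 0x1;	/* some other write type currently set */

	set_interior_need_write(&flags);
	printf("flags = %#lx\n", (unsigned long) atomic_load(&flags));
	return 0;
}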
static void
@@ -1196,8 +1333,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
;
while (!bch2_keylist_empty(keys)) {
- bch2_insert_fixup_btree_ptr(as, trans, path, b,
- &node_iter, bch2_keylist_front(keys));
+ struct bkey_i *k = bch2_keylist_front(keys);
+
+ if (bpos_gt(k->k.p, b->key.k.p))
+ break;
+
+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
bch2_keylist_pop_front(keys);
}
}
@@ -1206,109 +1347,91 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
* Move keys from n1 (original replacement node, now lower node) to n2 (higher
* node)
*/
-static struct btree *__btree_split_node(struct btree_update *as,
- struct btree *n1)
+static void __btree_split_node(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b,
+ struct btree *n[2])
{
- struct bkey_format_state s;
- size_t nr_packed = 0, nr_unpacked = 0;
- struct btree *n2;
- struct bset *set1, *set2;
- struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
- struct bpos n1_pos;
+ struct bkey_packed *k;
+ struct bpos n1_pos = POS_MIN;
+ struct btree_node_iter iter;
+ struct bset *bsets[2];
+ struct bkey_format_state format[2];
+ struct bkey_packed *out[2];
+ struct bkey uk;
+ unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
+ int i;
- n2 = bch2_btree_node_alloc(as, n1->c.level);
- bch2_btree_update_add_new_node(as, n2);
+ for (i = 0; i < 2; i++) {
+ BUG_ON(n[i]->nsets != 1);
- n2->data->max_key = n1->data->max_key;
- n2->data->format = n1->format;
- SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
- n2->key.k.p = n1->key.k.p;
+ bsets[i] = btree_bset_first(n[i]);
+ out[i] = bsets[i]->start;
- set1 = btree_bset_first(n1);
- set2 = btree_bset_first(n2);
+ SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
+ bch2_bkey_format_init(&format[i]);
+ }
- /*
- * Has to be a linear search because we don't have an auxiliary
- * search tree yet
- */
- k = set1->start;
- while (1) {
- struct bkey_packed *n = bkey_next(k);
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
+
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
+ uk = bkey_unpack_key(b, k);
+ if (!i)
+ n1_pos = uk.p;
+ bch2_bkey_format_add_key(&format[i], &uk);
+ }
- if (n == vstruct_last(set1))
- break;
- if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
- break;
+ btree_set_min(n[0], b->data->min_key);
+ btree_set_max(n[0], n1_pos);
+ btree_set_min(n[1], bpos_successor(n1_pos));
+ btree_set_max(n[1], b->data->max_key);
- if (bkey_packed(k))
- nr_packed++;
- else
- nr_unpacked++;
+ for (i = 0; i < 2; i++) {
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
- prev = k;
- k = n;
+ n[i]->data->format = bch2_bkey_format_done(&format[i]);
+ btree_node_set_format(n[i], n[i]->data->format);
}
- BUG_ON(!prev);
- set2_start = k;
- set2_end = vstruct_last(set1);
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
- set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
- set_btree_bset_end(n1, n1->set);
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
- n1->nr.live_u64s = le16_to_cpu(set1->u64s);
- n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
- n1->nr.packed_keys = nr_packed;
- n1->nr.unpacked_keys = nr_unpacked;
-
- n1_pos = bkey_unpack_pos(n1, prev);
- if (as->c->sb.version < bcachefs_metadata_version_snapshot)
- n1_pos.snapshot = U32_MAX;
+ if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
+ ? &b->format: &bch2_bkey_format_current, k))
+ out[i]->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bch2_bkey_unpack(b, (void *) out[i], k);
- btree_set_max(n1, n1_pos);
- btree_set_min(n2, bpos_successor(n1->key.k.p));
+ out[i]->needs_whiteout = false;
- bch2_bkey_format_init(&s);
- bch2_bkey_format_add_pos(&s, n2->data->min_key);
- bch2_bkey_format_add_pos(&s, n2->data->max_key);
-
- for (k = set2_start; k != set2_end; k = bkey_next(k)) {
- struct bkey uk = bkey_unpack_key(n1, k);
- bch2_bkey_format_add_key(&s, &uk);
+ btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
+ out[i] = bkey_p_next(out[i]);
}
- n2->data->format = bch2_bkey_format_done(&s);
- btree_node_set_format(n2, n2->data->format);
-
- out = set2->start;
- memset(&n2->nr, 0, sizeof(n2->nr));
+ for (i = 0; i < 2; i++) {
+ bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
- for (k = set2_start; k != set2_end; k = bkey_next(k)) {
- BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
- ? &n1->format : &bch2_bkey_format_current, k));
- out->format = KEY_FORMAT_LOCAL_BTREE;
- btree_keys_account_key_add(&n2->nr, 0, out);
- out = bkey_next(out);
- }
-
- set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
- set_btree_bset_end(n2, n2->set);
+ BUG_ON(!bsets[i]->u64s);
- BUG_ON(!set1->u64s);
- BUG_ON(!set2->u64s);
+ set_btree_bset_end(n[i], n[i]->set);
- btree_node_reset_sib_u64s(n1);
- btree_node_reset_sib_u64s(n2);
+ btree_node_reset_sib_u64s(n[i]);
- bch2_verify_btree_nr_keys(n1);
- bch2_verify_btree_nr_keys(n2);
+ bch2_verify_btree_nr_keys(n[i]);
- if (n1->c.level) {
- btree_node_interior_verify(as->c, n1);
- btree_node_interior_verify(as->c, n2);
+ if (b->c.level)
+ btree_node_interior_verify(as->c, n[i]);
}
-
- return n2;
}
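The rewritten __btree_split_node() above makes two passes over the live keys of the node being split: the first pass picks the split point at roughly 3/5 of the live u64s and accumulates a key format for each half, the second pass copies every key into whichever new node it now belongs to. A standalone sketch of the same two-pass weighted split on a plain array (illustrative data only):

#include <stdio.h>

struct item { unsigned weight; int key; };

int main(void)
{
	struct item in[] = {
		{ 3, 10 }, { 2, 20 }, { 4, 30 }, { 1, 40 }, { 5, 50 },
	};
	unsigned nr = sizeof(in) / sizeof(in[0]);
	unsigned total = 0, split, acc;
	int lo[8], hi[8];
	unsigned nr_lo = 0, nr_hi = 0;

	/* Pass 1: total weight and the ~3/5 split point: */
	for (unsigned i = 0; i < nr; i++)
		total += in[i].weight;
	split = (total * 3) / 5;

	/* Pass 2: items before the split point go left, the rest go right.
	 * Like the hunk above, the decision uses the weight accumulated so
	 * far, before counting the current item: */
	acc = 0;
	for (unsigned i = 0; i < nr; i++) {
		if (acc >= split)
			hi[nr_hi++] = in[i].key;
		else
			lo[nr_lo++] = in[i].key;
		acc += in[i].weight;
	}

	printf("left node gets %u keys, right node gets %u keys\n", nr_lo, nr_hi);
	return 0;
}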
/*
@@ -1328,75 +1451,67 @@ static void btree_split_insert_keys(struct btree_update *as,
struct btree *b,
struct keylist *keys)
{
- struct btree_node_iter node_iter;
- struct bkey_i *k = bch2_keylist_front(keys);
- struct bkey_packed *src, *dst, *n;
- struct bset *i;
+ if (!bch2_keylist_empty(keys) &&
+ bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
+ struct btree_node_iter node_iter;
- bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
+ bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
- /*
- * We can't tolerate whiteouts here - with whiteouts there can be
- * duplicate keys, and it would be rather bad if we picked a duplicate
- * for the pivot:
- */
- i = btree_bset_first(b);
- src = dst = i->start;
- while (src != vstruct_last(i)) {
- n = bkey_next(src);
- if (!bkey_deleted(src)) {
- memmove_u64s_down(dst, src, src->u64s);
- dst = bkey_next(dst);
- }
- src = n;
+ btree_node_interior_verify(as->c, b);
}
-
- /* Also clear out the unwritten whiteouts area: */
- b->whiteout_u64s = 0;
-
- i->u64s = cpu_to_le16((u64 *) dst - i->_data);
- set_btree_bset_end(b, b->set);
-
- BUG_ON(b->nsets != 1 ||
- b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
-
- btree_node_interior_verify(as->c, b);
}
-static void btree_split(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
- struct keylist *keys, unsigned flags)
+static int btree_split(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
+ struct btree_path *path1 = NULL, *path2 = NULL;
u64 start_time = local_clock();
+ int ret = 0;
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+ BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
bch2_btree_interior_update_will_free_node(as, b);
- n1 = bch2_btree_node_alloc_replacement(as, b);
- bch2_btree_update_add_new_node(as, n1);
+ if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
+ struct btree *n[2];
+
+ trace_and_count(c, btree_node_split, c, b);
- if (keys)
- btree_split_insert_keys(as, trans, path, n1, keys);
+ n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
+ n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
- if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
- trace_btree_split(c, b);
+ __btree_split_node(as, trans, b, n);
- n2 = __btree_split_node(as, n1);
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ btree_split_insert_keys(as, trans, path, n2, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
+
+ bch2_btree_update_add_new_node(as, n1);
+ bch2_btree_update_add_new_node(as, n2);
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
- bch2_btree_node_write(c, n2, SIX_LOCK_intent);
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path1, n1);
+
+ path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+ six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path2, n2);
/*
* Note that on recursive parent_keys == keys, so we
@@ -1408,22 +1523,40 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (!parent) {
/* Depth increases, make a new root */
- n3 = __btree_root_alloc(as, b->c.level + 1);
+ n3 = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n3);
+ six_unlock_write(&n3->c.lock);
+
+ path2->locks_want++;
+ BUG_ON(btree_node_locked(path2, n3->c.level));
+ six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path2, n3);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
-
- bch2_btree_node_write(c, n3, SIX_LOCK_intent);
}
} else {
- trace_btree_compact(c, b);
+ trace_and_count(c, btree_node_compact, c, b);
+
+ n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
bch2_btree_build_aux_trees(n1);
+ bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- bch2_btree_node_write(c, n1, SIX_LOCK_intent);
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, path1, n1);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
@@ -1433,7 +1566,9 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
- bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err;
} else if (n3) {
bch2_btree_set_root(as, trans, path, n3);
} else {
@@ -1441,20 +1576,16 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_set_root(as, trans, path, n1);
}
- bch2_btree_update_get_open_buckets(as, n1);
- if (n2)
- bch2_btree_update_get_open_buckets(as, n2);
- if (n3)
+ if (n3) {
bch2_btree_update_get_open_buckets(as, n3);
-
- /* Successful split, update the path to point to the new nodes: */
-
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- if (n3)
- bch2_trans_node_add(trans, n3);
- if (n2)
- bch2_trans_node_add(trans, n2);
- bch2_trans_node_add(trans, n1);
+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ }
+ if (n2) {
+ bch2_btree_update_get_open_buckets(as, n2);
+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ }
+ bch2_btree_update_get_open_buckets(as, n1);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
/*
* The old node must be freed (in memory) _before_ unlocking the new
@@ -1462,13 +1593,28 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
- bch2_btree_node_free_inmem(trans, b);
+ bch2_btree_node_free_inmem(trans, path, b);
+
+ if (n3)
+ bch2_trans_node_add(trans, n3);
+ if (n2)
+ bch2_trans_node_add(trans, n2);
+ bch2_trans_node_add(trans, n1);
if (n3)
six_unlock_intent(&n3->c.lock);
if (n2)
six_unlock_intent(&n2->c.lock);
six_unlock_intent(&n1->c.lock);
+out:
+ if (path2) {
+ __bch2_btree_path_unlock(trans, path2);
+ bch2_path_put(trans, path2, true);
+ }
+ if (path1) {
+ __bch2_btree_path_unlock(trans, path1);
+ bch2_path_put(trans, path1, true);
+ }
bch2_trans_verify_locks(trans);
@@ -1476,6 +1622,14 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
? BCH_TIME_btree_node_split
: BCH_TIME_btree_node_compact],
start_time);
+ return ret;
+err:
+ if (n3)
+ bch2_btree_node_free_never_used(as, trans, n3);
+ if (n2)
+ bch2_btree_node_free_never_used(as, trans, n2);
+ bch2_btree_node_free_never_used(as, trans, n1);
+ goto out;
}
static void
@@ -1510,22 +1664,30 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
-static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
- struct keylist *keys, unsigned flags)
+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ int ret;
lockdep_assert_held(&c->gc_lock);
- BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
+ BUG_ON(!btree_node_intent_locked(path, b->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- bch2_btree_node_lock_for_insert(trans, path, b);
+ if ((local_clock() & 63) == 63)
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+
+ ret = bch2_btree_node_lock_write(trans, path, &b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, b);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
@@ -1551,30 +1713,43 @@ static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *
bch2_btree_node_unlock_write(trans, path, b);
btree_node_interior_verify(c, b);
- return;
+ return 0;
split:
- btree_split(as, trans, path, b, keys, flags);
+ /*
+ * We could attempt to avoid the transaction restart, by calling
+ * bch2_btree_path_upgrade() and allocating more nodes:
+ */
+ if (b->c.level >= as->update_level) {
+ trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+ }
+
+ return btree_split(as, trans, path, b, keys, flags);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
struct btree_path *path,
unsigned flags)
{
- struct bch_fs *c = trans->c;
struct btree *b = path_l(path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
as = bch2_btree_update_start(trans, path, path->level,
- btree_update_reserve_required(c, b), flags);
+ true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
- btree_split(as, trans, path, b, NULL, flags);
- bch2_btree_update_done(as);
+ ret = btree_split(as, trans, path, b, NULL, flags);
+ if (ret) {
+ bch2_btree_update_free(as, trans);
+ return ret;
+ }
+
+ bch2_btree_update_done(as, trans);
- for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+ for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
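bch2_btree_insert_node() above now forces a transaction restart roughly once every 64 calls, via (local_clock() & 63) == 63, presumably so the split-race restart path gets exercised even when genuine races are rare. A standalone sketch of that style of cheap probabilistic fault injection; the clock source here is only a stand-in for the kernel's local_clock():

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

static unsigned long long local_clock_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long) ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Returns nonzero roughly once every 64 calls, forcing the caller down
 * its restart/error path so that path stays tested: */
static int inject_restart(void)
{
	return (local_clock_ns() & 63) == 63;
}

int main(void)
{
	unsigned injected = 0, calls = 100000;

	for (unsigned i = 0; i < calls; i++)
		injected += inject_restart();

	printf("injected %u restarts in %u calls (~1/64 expected)\n",
	       injected, calls);
	return 0;
}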
@@ -1587,7 +1762,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
enum btree_node_sibling sib)
{
struct bch_fs *c = trans->c;
- struct btree_path *sib_path = NULL;
+ struct btree_path *sib_path = NULL, *new_path = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
@@ -1603,8 +1778,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
b = path->l[level].b;
- if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
- (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) {
+ if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
+ (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
b->sib_u64s[sib] = U16_MAX;
return 0;
}
@@ -1619,7 +1794,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
- sib_path->should_be_locked = true;
+ btree_path_set_should_be_locked(sib_path);
m = sib_path->l[level].b;
@@ -1637,16 +1812,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
next = m;
}
- if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
- char buf1[100], buf2[100];
+ if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
- bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
- bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
+ bch2_bpos_to_text(&buf1, prev->data->max_key);
+ bch2_bpos_to_text(&buf2, next->data->min_key);
bch_err(c,
- "btree topology error in btree merge:\n"
+ "%s(): btree topology error:\n"
" prev ends at %s\n"
" next starts at %s",
- buf1, buf2);
+ __func__, buf1.buf, buf2.buf);
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
bch2_topology_error(c);
ret = -EIO;
goto err;
@@ -1676,36 +1853,42 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
goto out;
parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, level,
- btree_update_reserve_required(c, parent) + 1,
- flags|
+ as = bch2_btree_update_start(trans, path, level, false,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
+ BTREE_INSERT_USE_RESERVE|
+ flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
- trace_btree_merge(c, b);
+ trace_and_count(c, btree_node_merge, c, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
- n = bch2_btree_node_alloc(as, b->c.level);
- bch2_btree_update_add_new_node(as, n);
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
+
+ SET_BTREE_NODE_SEQ(n->data,
+ max(BTREE_NODE_SEQ(b->data),
+ BTREE_NODE_SEQ(m->data)) + 1);
btree_set_min(n, prev->data->min_key);
btree_set_max(n, next->data->max_key);
- n->data->format = new_f;
+ n->data->format = new_f;
btree_node_set_format(n, new_f);
bch2_btree_sort_into(c, n, prev);
bch2_btree_sort_into(c, n, next);
bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
+ new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, new_path, n);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
@@ -1714,32 +1897,38 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
- bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err_free_update;
bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- six_lock_increment(&m->c.lock, SIX_LOCK_intent);
+ bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, sib_path, m);
bch2_trans_node_add(trans, n);
bch2_trans_verify_paths(trans);
- bch2_btree_node_free_inmem(trans, b);
- bch2_btree_node_free_inmem(trans, m);
-
six_unlock_intent(&n->c.lock);
- bch2_btree_update_done(as);
+ bch2_btree_update_done(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
bch2_path_put(trans, sib_path, true);
bch2_trans_verify_locks(trans);
return ret;
+err_free_update:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
}
/**
@@ -1751,6 +1940,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
+ struct btree_path *new_path = NULL;
struct btree *n, *parent;
struct btree_update *as;
int ret;
@@ -1759,52 +1949,60 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
parent = btree_node_parent(iter->path, b);
as = bch2_btree_update_start(trans, iter->path, b->c.level,
- (parent
- ? btree_update_reserve_required(c, parent)
- : 0) + 1,
- flags);
+ false, flags);
ret = PTR_ERR_OR_ZERO(as);
- if (ret) {
- trace_btree_gc_rewrite_node_fail(c, b);
+ if (ret)
goto out;
- }
bch2_btree_interior_update_will_free_node(as, b);
- n = bch2_btree_node_alloc_replacement(as, b);
- bch2_btree_update_add_new_node(as, n);
+ n = bch2_btree_node_alloc_replacement(as, trans, b);
bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- trace_btree_gc_rewrite_node(c, b);
+ new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+ bch2_btree_path_level_init(trans, new_path, n);
- bch2_btree_node_write(c, n, SIX_LOCK_intent);
+ trace_and_count(c, btree_node_rewrite, c, b);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_insert_node(as, trans, iter->path, parent,
- &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent,
+ &as->parent_keys, flags);
+ if (ret)
+ goto err;
} else {
bch2_btree_set_root(as, trans, iter->path, n);
}
bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+ bch2_btree_node_free_inmem(trans, iter->path, b);
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
bch2_trans_node_add(trans, n);
- bch2_btree_node_free_inmem(trans, b);
six_unlock_intent(&n->c.lock);
- bch2_btree_update_done(as);
+ bch2_btree_update_done(as, trans);
out:
- bch2_btree_path_downgrade(iter->path);
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_btree_path_downgrade(trans, iter->path);
return ret;
+err:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
}
struct async_btree_rewrite {
struct bch_fs *c;
struct work_struct work;
+ struct list_head list;
enum btree_id btree_id;
unsigned level;
struct bpos pos;
@@ -1814,6 +2012,7 @@ struct async_btree_rewrite {
static int async_btree_node_rewrite_trans(struct btree_trans *trans,
struct async_btree_rewrite *a)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct btree *b;
int ret;
@@ -1825,11 +2024,21 @@ static int async_btree_node_rewrite_trans(struct btree_trans *trans,
if (ret)
goto out;
- if (!b || b->data->keys.seq != a->seq)
+ if (!b || b->data->keys.seq != a->seq) {
+ struct printbuf buf = PRINTBUF;
+
+ if (b)
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ else
+ prt_str(&buf, "(null)");
+ bch_info(c, "%s: node to rewrite not found, searching for seq %llu, got\n%s",

+ __func__, a->seq, buf.buf);
+ printbuf_exit(&buf);
goto out;
+ }
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
-out :
+out:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -1840,23 +2049,24 @@ void async_btree_node_rewrite_work(struct work_struct *work)
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
+ int ret;
- bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(&trans, a));
- percpu_ref_put(&c->writes);
+ if (ret)
+ bch_err(c, "%s: error %s", __func__, bch2_err_str(ret));
+ bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
struct async_btree_rewrite *a;
-
- if (!percpu_ref_tryget(&c->writes))
- return;
+ int ret;
a = kmalloc(sizeof(*a), GFP_NOFS);
if (!a) {
- percpu_ref_put(&c->writes);
+ bch_err(c, "%s: error allocating memory", __func__);
return;
}
@@ -1865,11 +2075,63 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
a->level = b->c.level;
a->pos = b->key.k.p;
a->seq = b->data->keys.seq;
-
INIT_WORK(&a->work, async_btree_node_rewrite_work);
+
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_add(&a->list, &c->pending_node_rewrites);
+ mutex_unlock(&c->pending_node_rewrites_lock);
+ return;
+ }
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ if (test_bit(BCH_FS_STARTED, &c->flags)) {
+ bch_err(c, "%s: error getting c->writes ref", __func__);
+ kfree(a);
+ return;
+ }
+
+ ret = bch2_fs_read_write_early(c);
+ if (ret) {
+ bch_err(c, "%s: error going read-write: %s",
+ __func__, bch2_err_str(ret));
+ kfree(a);
+ return;
+ }
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ }
+
queue_work(c->btree_interior_update_worker, &a->work);
}
+void bch2_do_pending_node_rewrites(struct bch_fs *c)
+{
+ struct async_btree_rewrite *a, *n;
+
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
+ list_del(&a->list);
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ queue_work(c->btree_interior_update_worker, &a->work);
+ }
+ mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
+void bch2_free_pending_node_rewrites(struct bch_fs *c)
+{
+ struct async_btree_rewrite *a, *n;
+
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
+ list_del(&a->list);
+
+ kfree(a);
+ }
+ mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
static int __bch2_btree_node_update_key(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
@@ -1879,21 +2141,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter2 = { NULL };
struct btree *parent;
- u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
int ret;
if (!skip_triggers) {
- ret = bch2_trans_mark_key(trans,
- bkey_s_c_null,
- bkey_i_to_s_c(new_key),
- BTREE_TRIGGER_INSERT);
+ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key), 0);
if (ret)
return ret;
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(&b->key),
- bkey_s_c_null,
- BTREE_TRIGGER_OVERWRITE);
+ ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+ new_key, 0);
if (ret)
return ret;
}
@@ -1914,11 +2171,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
_THIS_IP_);
BUG_ON(iter2.path->level != b->c.level);
- BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
+ BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));
+
+ btree_path_set_level_up(trans, iter2.path);
- btree_node_unlock(iter2.path, iter2.path->level);
- path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
- iter2.path->level++;
+ trans->paths_sorted = false;
ret = bch2_btree_iter_traverse(&iter2) ?:
bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
@@ -1927,12 +2184,16 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
} else {
BUG_ON(btree_node_root(c, b) != b);
- trans->extra_journal_entries = (void *) &journal_entries[0];
- trans->extra_journal_entry_u64s =
- journal_entry_set((void *) &journal_entries[0],
- BCH_JSET_ENTRY_btree_root,
- b->c.btree_id, b->c.level,
- new_key, new_key->k.u64s);
+ ret = darray_make_room(&trans->extra_journal_entries,
+ jset_u64s(new_key->k.u64s));
+ if (ret)
+ return ret;
+
+ journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ new_key, new_key->k.u64s);
+ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
}
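This hunk, like the earlier one in btree_update_nodes_written_trans(), replaces a fixed on-stack journal-entry buffer with an append onto the transaction's growable extra_journal_entries array: make room, write at the current end, then bump nr. A minimal stand-in for that append pattern (this darray is illustrative, not the bcachefs implementation):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct u64_darray {
	unsigned long long	*data;
	size_t			 nr, size;
};

/* Grow the array so that at least `more` extra elements fit: */
static int darray_make_room(struct u64_darray *d, size_t more)
{
	if (d->nr + more > d->size) {
		size_t new_size = d->size ? d->size * 2 : 8;

		while (new_size < d->nr + more)
			new_size *= 2;

		void *p = realloc(d->data, new_size * sizeof(*d->data));
		if (!p)
			return -1;
		d->data = p;
		d->size = new_size;
	}
	return 0;
}

int main(void)
{
	struct u64_darray extra = { NULL, 0, 0 };
	unsigned long long entry[3] = { 1, 2, 3 };	/* pretend journal entry */

	if (darray_make_room(&extra, 3))
		return 1;

	/* Write at the current top, then account for it: */
	memcpy(extra.data + extra.nr, entry, sizeof(entry));
	extra.nr += 3;

	printf("now holding %zu u64s of extra journal entries\n", extra.nr);
	free(extra.data);
	return 0;
}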
ret = bch2_trans_commit(trans, NULL, NULL,
@@ -1940,11 +2201,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED);
+ JOURNAL_WATERMARK_reserved);
if (ret)
goto err;
- bch2_btree_node_lock_write(trans, iter->path, b);
+ bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -1982,11 +2243,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
struct closure cl;
int ret = 0;
- if (!btree_node_intent_locked(path, b->c.level) &&
- !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) {
- btree_trans_restart(trans);
- return -EINTR;
- }
+ ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
+ if (ret)
+ return ret;
closure_init_stack(&cl);
@@ -1999,11 +2258,12 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
if (ret) {
bch2_trans_unlock(trans);
closure_sync(&cl);
- if (!bch2_trans_relock(trans))
- return -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ret;
}
- new_hash = bch2_btree_node_mem_alloc(c);
+ new_hash = bch2_btree_node_mem_alloc(trans, false);
}
path->intent_ref++;
@@ -2066,8 +2326,9 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
bch2_btree_set_root_inmem(c, b);
}
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
{
+ struct bch_fs *c = trans->c;
struct closure cl;
struct btree *b;
int ret;
@@ -2079,7 +2340,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(trans, false);
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
@@ -2108,6 +2369,12 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
+ return 0;
+}
+
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+{
+ bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id));
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2116,7 +2383,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_interior_update_lock);
list_for_each_entry(as, &c->btree_interior_update_list, list)
- pr_buf(out, "%p m %u w %u r %u j %llu\n",
+ prt_printf(out, "%p m %u w %u r %u j %llu\n",
as,
as->mode,
as->nodes_written,
@@ -2125,33 +2392,36 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
mutex_unlock(&c->btree_interior_update_lock);
}
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
- size_t ret = 0;
- struct list_head *i;
+ bool ret;
mutex_lock(&c->btree_interior_update_lock);
- list_for_each(i, &c->btree_interior_update_list)
- ret++;
+ ret = !list_empty(&c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return ret;
}
-void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
{
- struct btree_root *r;
- struct jset_entry *entry;
+ bool ret = bch2_btree_interior_updates_pending(c);
+
+ if (ret)
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_pending(c));
+ return ret;
+}
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
+{
+ struct btree_root *r = &c->btree_roots[entry->btree_id];
mutex_lock(&c->btree_root_lock);
- vstruct_for_each(jset, entry)
- if (entry->type == BCH_JSET_ENTRY_btree_root) {
- r = &c->btree_roots[entry->btree_id];
- r->level = entry->level;
- r->alive = true;
- bkey_copy(&r->key, &entry->start[0]);
- }
+ r->level = entry->level;
+ r->alive = true;
+ bkey_copy(&r->key, &entry->start[0]);
mutex_unlock(&c->btree_root_lock);
}
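
bch2_btree_interior_updates_flush() above reduces to "if anything is pending, wait until the list drains, and report whether there was anything to wait for". A standalone sketch of that shape using pthreads (hypothetical names; the real code uses closures, not condition variables):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
	static unsigned nr_pending;	/* number of in-flight interior updates */

	static bool updates_pending(void)
	{
		bool ret;

		pthread_mutex_lock(&lock);
		ret = nr_pending != 0;
		pthread_mutex_unlock(&lock);
		return ret;
	}

	/* returns true if we had to wait, loosely mirroring the flush helper above */
	static bool updates_flush(void)
	{
		bool waited = false;

		pthread_mutex_lock(&lock);
		while (nr_pending) {
			waited = true;
			pthread_cond_wait(&drained, &lock);
		}
		pthread_mutex_unlock(&lock);
		return waited;
	}

	/* producers call this when an update completes */
	static void update_done(void)
	{
		pthread_mutex_lock(&lock);
		if (!--nr_pending)
			pthread_cond_broadcast(&drained);
		pthread_mutex_unlock(&lock);
	}
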
@@ -2177,7 +2447,7 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,
BCH_JSET_ENTRY_btree_root,
i, c->btree_roots[i].level,
&c->btree_roots[i].key,
- c->btree_roots[i].key.u64s);
+ c->btree_roots[i].key.k.u64s);
end = vstruct_next(end);
}
@@ -2201,11 +2471,17 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
mutex_init(&c->btree_interior_update_lock);
INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
+ INIT_LIST_HEAD(&c->pending_node_rewrites);
+ mutex_init(&c->pending_node_rewrites_lock);
+
c->btree_interior_update_worker =
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
if (!c->btree_interior_update_worker)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
- return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_update));
+ if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+ sizeof(struct btree_update)))
+ return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+
+ return 0;
}
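
The init-path change above returns distinct, named allocation-failure codes rather than a bare -ENOMEM. A toy version of that idea, assuming (hypothetically) that private codes live outside the errno range and can be mapped back to an errno class for callers that only care whether the failure was ENOMEM:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* private codes: outside the normal errno range, each names one call site */
	enum {
		ERR_START = 2048,
		ERR_ENOMEM_update_worker_init,
		ERR_ENOMEM_update_pool_init,
		ERR_MAX,
	};

	/* map a private code back to the generic class it refines */
	static int err_class(int err)
	{
		switch (-err) {
		case ERR_ENOMEM_update_worker_init:
		case ERR_ENOMEM_update_pool_init:
			return -ENOMEM;
		default:
			return err;
		}
	}

	static bool err_matches(int err, int class)
	{
		return err == -class || err_class(err) == -class;
	}

	int main(void)
	{
		int ret = -ERR_ENOMEM_update_worker_init;

		printf("specific code %d, ENOMEM class: %d\n",
		       ret, err_matches(ret, ENOMEM));
		return 0;
	}
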
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 8dc86fa636d6..dcfd7ceacc59 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -52,6 +52,7 @@ struct btree_update {
unsigned took_gc_lock:1;
enum btree_id btree_id;
+ unsigned update_level;
struct disk_reservation disk_res;
struct journal_preres journal_preres;
@@ -76,8 +77,10 @@ struct btree_update {
struct journal_entry_pin journal;
/* Preallocated nodes we reserve when we start the update: */
- struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_prealloc_nodes;
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
/* Nodes being freed: */
struct keylist old_keys;
@@ -115,6 +118,7 @@ struct btree_update {
};
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+ struct btree_trans *,
struct btree *,
struct bkey_format);
@@ -278,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
struct bkey_packed k;
BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey *u = (void *) &k;
@@ -307,12 +312,15 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
-void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
+void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
struct jset_entry *, struct jset_entry *);
+void bch2_do_pending_node_rewrites(struct bch_fs *);
+void bch2_free_pending_node_rewrites(struct bch_fs *);
+
void bch2_fs_btree_interior_update_exit(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);
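
The header change above folds the prealloc_nodes[] pointer array and its counter into a small struct, kept as a two-element array (what the two slots distinguish is not visible in this hunk, so it is treated as opaque here). A sketch of that grouping with push/pop helpers, hypothetical names only:

	#include <assert.h>
	#include <stddef.h>

	#define NODES_MAX	16

	struct node;				/* opaque for this sketch */

	struct prealloc_nodes {
		struct node	*b[NODES_MAX];
		unsigned	nr;
	};

	struct update {
		/* two independent reserves, indexed 0 and 1 */
		struct prealloc_nodes prealloc[2];
	};

	static void prealloc_push(struct update *u, unsigned idx, struct node *n)
	{
		struct prealloc_nodes *p = &u->prealloc[idx];

		assert(p->nr < NODES_MAX);
		p->b[p->nr++] = n;
	}

	static struct node *prealloc_pop(struct update *u, unsigned idx)
	{
		struct prealloc_nodes *p = &u->prealloc[idx];

		return p->nr ? p->b[--p->nr] : NULL;
	}
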
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 7186457d198b..c17d048b1c26 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -8,8 +8,10 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "debug.h"
+#include "errcode.h"
#include "error.h"
#include "extent_update.h"
#include "journal.h"
@@ -23,10 +25,54 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+ if (k.k && bpos_eq(path->pos, k.k->p))
+ return k;
+
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
+
+ if (j_k)
+ k = bkey_i_to_s_c(j_k);
+ }
+
+ u = *k.k;
+ u.needs_whiteout = i->old_k.needs_whiteout;
+
+ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
+ BUG_ON(i->old_v != k.v);
+#endif
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, enum btree_update_flags);
+
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->cached, r->cached) ?:
-cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p);
}
@@ -50,15 +96,12 @@ static inline bool same_leaf_as_next(struct btree_trans *trans,
insert_l(&i[0])->b == insert_l(&i[1])->b;
}
-static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
struct bch_fs *c = trans->c;
- if (path->cached)
- return;
-
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
bch2_trans_node_reinit_iter(trans, b);
@@ -71,14 +114,6 @@ static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
bch2_btree_init_next(trans, b);
}
-void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- bch2_btree_node_lock_write(trans, path, b);
- bch2_btree_node_prep_for_write(trans, path, b);
-}
-
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -94,8 +129,8 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
- EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
+ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, b));
@@ -163,20 +198,41 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
+ struct btree_trans trans;
+ unsigned long old, new, v;
+ unsigned idx = w - b->writes;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ v = READ_ONCE(b->flags);
+
+ do {
+ old = new = v;
- btree_node_lock_type(c, b, SIX_LOCK_read);
- bch2_btree_node_write_cond(c, b,
- (btree_current_write(b) == w && w->journal.seq == seq));
+ if (!(old & (1 << BTREE_NODE_dirty)) ||
+ !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+ w->journal.seq != seq)
+ break;
+
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_journal_reclaim;
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
+
+ bch2_trans_exit(&trans);
return 0;
}
-static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 0, seq);
}
-static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 1, seq);
}
@@ -188,35 +244,36 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
bch2_journal_pin_add(&c->journal, seq, &w->journal,
btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
+ ? bch2_btree_node_flush0
+ : bch2_btree_node_flush1);
}
/**
* btree_insert_key - insert one key into a leaf node
*/
-static bool btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_insert_entry *insert)
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bkey_i *insert,
+ u64 journal_seq)
{
struct bch_fs *c = trans->c;
- struct btree *b = insert_l(insert)->b;
+ struct btree *b = path_l(path)->b;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = bset(b, t);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
- &insert_l(insert)->iter, insert->k)))
- return false;
+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+ &path_l(path)->iter, insert)))
+ return;
- i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
- le64_to_cpu(i->journal_seq)));
+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
- bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
+ bch2_btree_add_journal_pin(c, b, journal_seq);
if (unlikely(!btree_node_dirty(b)))
- set_btree_node_dirty(c, b);
+ set_btree_node_dirty_acct(c, b);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -229,8 +286,12 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
+}
- return true;
+static void btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_insert_entry *insert)
+{
+ bch2_btree_insert_key_leaf(trans, insert->path, insert->k, trans->journal_res.seq);
}
/* Cached btree updates: */
@@ -240,7 +301,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- BUG_ON(bpos_cmp(i->k->k.p, i->path->pos));
+ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
BUG_ON(i->cached != i->path->cached);
BUG_ON(i->level != i->path->level);
BUG_ON(i->btree_id != i->path->btree_id);
@@ -252,7 +313,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
}
static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
@@ -261,31 +322,26 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
bch2_trans_unlock(trans);
ret = bch2_journal_preres_get(&c->journal,
- &trans->journal_preres, u64s, 0);
+ &trans->journal_preres,
+ trans->journal_preres_u64s,
+ (flags & JOURNAL_WATERMARK_MASK));
if (ret)
return ret;
- if (!bch2_trans_relock(trans)) {
- trace_trans_restart_journal_preres_get(trans->fn, trace_ip);
- return -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0);
+ return ret;
}
return 0;
}
-static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
- unsigned flags)
+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+ unsigned flags)
{
- struct bch_fs *c = trans->c;
- int ret;
-
- if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
- flags |= JOURNAL_RES_GET_RESERVED;
-
- ret = bch2_journal_res_get(&c->journal, &trans->journal_res,
- trans->journal_u64s, flags);
-
- return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
+ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
+ trans->journal_u64s, flags);
}
#define JSET_ENTRY_LOG_U64s 4
@@ -293,47 +349,34 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
static noinline void journal_transaction_name(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res);
- struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
- unsigned u64s = JSET_ENTRY_LOG_U64s - 1;
- unsigned b, buflen = u64s * sizeof(u64);
-
- l->entry.u64s = cpu_to_le16(u64s);
- l->entry.btree_id = 0;
- l->entry.level = 0;
- l->entry.type = BCH_JSET_ENTRY_log;
- l->entry.pad[0] = 0;
- l->entry.pad[1] = 0;
- l->entry.pad[2] = 0;
- b = min_t(unsigned, strlen(trans->fn), buflen);
- memcpy(l->d, trans->fn, b);
- while (b < buflen)
- l->d[b++] = '\0';
-
- trans->journal_res.offset += JSET_ENTRY_LOG_U64s;
- trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s;
+ struct journal *j = &c->journal;
+ struct jset_entry *entry =
+ bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_log, 0, 0,
+ JSET_ENTRY_LOG_U64s);
+ struct jset_entry_log *l =
+ container_of(entry, struct jset_entry_log, entry);
+
+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
}
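
journal_transaction_name() now copies the transaction name into a fixed, u64-sized log entry with strncpy(), relying on strncpy's zero-padding; note that strncpy does not NUL-terminate when the source fills the field, so readers must treat it as fixed width. A sketch of writing and printing such a field (toy buffer, not the real jset entry):

	#include <stdio.h>
	#include <string.h>

	#define LOG_U64S	4
	#define LOG_BYTES	(LOG_U64S * sizeof(unsigned long long))

	struct log_entry {
		char d[LOG_BYTES];	/* fixed-width, zero-padded, maybe not NUL-terminated */
	};

	static void log_set_name(struct log_entry *e, const char *fn)
	{
		/* pads with '\0' if fn is short; silently truncates (no NUL) if long */
		strncpy(e->d, fn, sizeof(e->d));
	}

	int main(void)
	{
		struct log_entry e;

		log_set_name(&e, "bch2_truncate");
		/* %.*s bounds the read, so a missing terminator is harmless here */
		printf("%.*s\n", (int) sizeof(e.d), e.d);
		return 0;
	}
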
-static inline enum btree_insert_ret
-btree_key_can_insert(struct btree_trans *trans,
- struct btree *b,
- unsigned u64s)
+static inline int btree_key_can_insert(struct btree_trans *trans,
+ struct btree *b, unsigned u64s)
{
struct bch_fs *c = trans->c;
if (!bch2_btree_node_insert_fits(c, b, u64s))
- return BTREE_INSERT_BTREE_NODE_FULL;
+ return -BCH_ERR_btree_insert_btree_node_full;
- return BTREE_INSERT_OK;
+ return 0;
}
-static enum btree_insert_ret
-btree_key_can_insert_cached(struct btree_trans *trans,
- struct btree_path *path,
- unsigned u64s)
+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned u64s)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) path->l[0].b;
+ struct btree_insert_entry *i;
unsigned new_u64s;
struct bkey_i *new_k;
@@ -341,8 +384,8 @@ btree_key_can_insert_cached(struct btree_trans *trans,
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c) &&
- !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
- return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
+ !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
* bch2_varint_decode can read past the end of the buffer by at most 7
@@ -351,54 +394,190 @@ btree_key_can_insert_cached(struct btree_trans *trans,
u64s += 1;
if (u64s <= ck->u64s)
- return BTREE_INSERT_OK;
+ return 0;
new_u64s = roundup_pow_of_two(u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_ids[path->btree_id], new_u64s);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
ck->u64s = new_u64s;
ck->k = new_k;
- return BTREE_INSERT_OK;
+ return 0;
}
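
btree_key_can_insert_cached() above grows the cached key to the next power of two and then repoints any pending update whose old value aliased the old buffer. A standalone sketch of that resize-and-fixup step (plain realloc, hypothetical types; the alias check is done before the buffer moves so no stale pointer is ever compared):

	#include <stdbool.h>
	#include <stdlib.h>
	#include <string.h>

	struct cached_key { unsigned u64s; unsigned long long *k; };

	struct update { const unsigned long long *old_v; };

	static unsigned round_up_pow2(unsigned n)
	{
		unsigned r = 1;
		while (r < n)
			r <<= 1;
		return r;
	}

	/* grow ck->k to hold at least u64s words, fixing up aliases in updates[] */
	static int key_grow(struct cached_key *ck, unsigned u64s,
			    struct update *updates, unsigned nr)
	{
		unsigned long long *old_k = ck->k, *new_k;
		unsigned new_u64s = round_up_pow2(u64s);
		bool alias[16] = { false };
		unsigned i;

		if (u64s <= ck->u64s)
			return 0;

		/* note aliases while old_k is still the live pointer */
		for (i = 0; i < nr && i < 16; i++)
			alias[i] = updates[i].old_v == old_k;

		new_k = realloc(old_k, new_u64s * sizeof(*new_k));
		if (!new_k)
			return -1;

		memset(new_k + ck->u64s, 0, (new_u64s - ck->u64s) * sizeof(*new_k));

		for (i = 0; i < nr && i < 16; i++)
			if (alias[i])
				updates[i].old_v = new_k;

		ck->u64s = new_u64s;
		ck->k    = new_k;
		return 0;
	}
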
-static inline void do_btree_insert_one(struct btree_trans *trans,
- struct btree_insert_entry *i)
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ unsigned flags)
{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- bool did_work;
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+ struct bkey_i *new = i->k;
+ int ret;
+
+ verify_update_old_key(trans, i);
- EBUG_ON(trans->journal_res.ref !=
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ return 0;
- i->k->k.needs_whiteout = false;
+ if (!btree_node_type_needs_gc(i->btree_id))
+ return 0;
- did_work = !i->cached
- ? btree_insert_key_leaf(trans, i)
- : bch2_btree_insert_key_cached(trans, i->path, i->k);
- if (!did_work)
- return;
+ if (bch2_bkey_ops[old.k->type].atomic_trigger ==
+ bch2_bkey_ops[i->k->k.type].atomic_trigger &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ ret = bch2_mark_key(trans, i->btree_id, i->level,
+ old, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ } else {
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- bch2_journal_add_keys(j, &trans->journal_res,
- i->btree_id,
- i->level,
- i->k);
+ _deleted.p = i->path->pos;
- if (trans->journal_seq)
- *trans->journal_seq = trans->journal_res.seq;
+ ret = bch2_mark_key(trans, i->btree_id, i->level,
+ deleted, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_mark_key(trans, i->btree_id, i->level,
+ old, deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
}
+
+ return ret;
}
-static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
+{
+ /*
+ * Transactional triggers create new btree_insert_entries, so we can't
+ * pass them a pointer to a btree_insert_entry, that memory is going to
+ * move:
+ */
+ struct bkey old_k = i->old_k;
+ struct bkey_s_c old = { &old_k, i->old_v };
+
+ verify_update_old_key(trans, i);
+
+ if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ return 0;
+
+ if (!i->insert_trigger_run &&
+ !i->overwrite_trigger_run &&
+ bch2_bkey_ops[old.k->type].trans_trigger ==
+ bch2_bkey_ops[i->k->k.type].trans_trigger &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ i->overwrite_trigger_run = true;
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|
+ i->flags) ?: 1;
+ } else if (overwrite && !i->overwrite_trigger_run) {
+ i->overwrite_trigger_run = true;
+ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+ } else if (!overwrite && !i->insert_trigger_run) {
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+ } else {
+ return 0;
+ }
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ struct btree_insert_entry *btree_id_start)
+{
+ struct btree_insert_entry *i;
+ bool trans_trigger_run;
+ int ret, overwrite;
+
+ for (overwrite = 1; overwrite >= 0; --overwrite) {
+
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ if (i->btree_id != btree_id)
+ continue;
+
+ ret = run_one_trans_trigger(trans, i, overwrite);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+ }
+
+ return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ unsigned btree_id = 0;
+ int ret = 0;
+
+ /*
+ *
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ if (btree_id == BTREE_ID_alloc)
+ continue;
+
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
+
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
+ }
+
+ trans_for_each_update(trans, i) {
+ if (i->btree_id > BTREE_ID_alloc)
+ break;
+ if (i->btree_id == BTREE_ID_alloc) {
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ if (ret)
+ return ret;
+ break;
+ }
+ }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
+#endif
+ return 0;
+}
+
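
run_btree_triggers()/bch2_trans_commit_run_triggers() above keep re-walking the update list because running a trigger may append more updates. The core of that is a fixed-point loop over a growable worklist; a minimal index-based sketch with toy work items (not the trigger machinery itself):

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_WORK 32

	struct work { bool done; int payload; };

	static struct work queue[MAX_WORK];
	static unsigned nr_work;

	/* running one item may append more; report whether it did any work */
	static bool run_one(unsigned i)
	{
		if (queue[i].done)
			return false;

		queue[i].done = true;
		if (queue[i].payload > 0 && nr_work < MAX_WORK) {
			/* append follow-up work, as triggers append updates */
			queue[nr_work++] = (struct work) { .payload = queue[i].payload - 1 };
		}
		return true;
	}

	static void run_to_fixpoint(void)
	{
		bool ran;

		do {
			ran = false;
			/* index-based walk: the array may grow while we iterate */
			for (unsigned i = 0; i < nr_work; i++)
				ran |= run_one(i);
		} while (ran);
	}

	int main(void)
	{
		queue[nr_work++] = (struct work) { .payload = 3 };
		run_to_fixpoint();
		printf("ran %u items\n", nr_work);
		return 0;
	}
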
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ int ret = 0;
trans_for_each_update(trans, i) {
/*
@@ -407,28 +586,32 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b)))
- bch2_mark_update(trans, i->path, i->k,
- i->flags|BTREE_TRIGGER_GC);
+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (ret)
+ break;
+ }
}
+
+ return ret;
}
static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans,
+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct btree_insert_entry **stopped_at,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
bool marking = false;
int ret;
if (race_fault()) {
- trace_trans_restart_fault_inject(trans->fn, trace_ip);
- trans->restarted = true;
- return -EINTR;
+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
}
/*
@@ -439,14 +622,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
- h = trans->hooks;
- while (h) {
- ret = h->fn(trans, h);
- if (ret)
- return ret;
- h = h->next;
- }
-
trans_for_each_update(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
@@ -455,7 +630,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
u64s += i->k->k.u64s;
ret = !i->cached
? btree_key_can_insert(trans, insert_l(i)->b, u64s)
- : btree_key_can_insert_cached(trans, i->path, u64s);
+ : btree_key_can_insert_cached(trans, flags, i->path, u64s);
if (ret) {
*stopped_at = i;
return ret;
@@ -465,12 +640,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
marking = true;
}
+ if (trans->nr_wb_updates &&
+ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
+ return -BCH_ERR_btree_insert_need_flush_buffer;
+
/*
* Don't get journal reservation until after we know insert will
* succeed:
*/
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
ret = bch2_trans_journal_res_get(trans,
+ (flags & JOURNAL_WATERMARK_MASK)|
JOURNAL_RES_GET_NONBLOCK);
if (ret)
return ret;
@@ -481,21 +661,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
trans->journal_res.seq = c->journal.replay_journal_seq;
}
- if (unlikely(trans->extra_journal_entry_u64s)) {
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->extra_journal_entries,
- trans->extra_journal_entry_u64s);
-
- trans->journal_res.offset += trans->extra_journal_entry_u64s;
- trans->journal_res.u64s -= trans->extra_journal_entry_u64s;
- }
-
/*
* Not allowed to fail after we've gotten our journal reservation - we
* have to use it:
*/
- if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
@@ -506,89 +678,120 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (trans->fs_usage_deltas &&
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
- return BTREE_INSERT_NEED_MARK_REPLICAS;
+ return -BCH_ERR_btree_insert_need_mark_replicas;
- trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
- bch2_mark_update(trans, i->path, i->k, i->flags);
+ if (trans->nr_wb_updates) {
+ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+
+ ret = bch2_btree_insert_keys_write_buffer(trans);
+ if (ret)
+ goto revert_fs_usage;
+ }
- if (unlikely(c->gc_pos.phase))
- bch2_trans_mark_gc(trans);
+ h = trans->hooks;
+ while (h) {
+ ret = h->fn(trans, h);
+ if (ret)
+ goto revert_fs_usage;
+ h = h->next;
+ }
trans_for_each_update(trans, i)
- do_btree_insert_one(trans, i);
+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, i->flags);
+ if (ret)
+ goto fatal_err;
+ }
- return ret;
-}
+ if (unlikely(c->gc_pos.phase)) {
+ ret = bch2_trans_commit_run_gc_triggers(trans);
+ if (ret)
+ goto fatal_err;
+ }
-static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path)
-{
- unsigned l;
+ if (unlikely(trans->extra_journal_entries.nr)) {
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+ trans->extra_journal_entries.data,
+ trans->extra_journal_entries.nr);
- for (l = 0; l < BTREE_MAX_DEPTH; l++)
- if (btree_node_read_locked(path, l))
- BUG_ON(!bch2_btree_node_upgrade(trans, path, l));
-}
+ trans->journal_res.offset += trans->extra_journal_entries.nr;
+ trans->journal_res.u64s -= trans->extra_journal_entries.nr;
+ }
-static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
-{
- struct btree *b = path_l(path)->b;
+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ struct journal *j = &c->journal;
+ struct jset_entry *entry;
- do {
- if (path->nodes_locked &&
- path->nodes_locked != path->nodes_intent_locked)
- path_upgrade_readers(trans, path);
- } while ((path = prev_btree_path(trans, path)) &&
- path_l(path)->b == b);
-}
+ trans_for_each_update(trans, i) {
+ if (i->key_cache_already_flushed)
+ continue;
-/*
- * Check for nodes that we have both read and intent locks on, and upgrade the
- * readers to intent:
- */
-static inline void normalize_read_intent_locks(struct btree_trans *trans)
-{
- struct btree_path *path;
- unsigned i, nr_read = 0, nr_intent = 0;
+ if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ continue;
- trans_for_each_path_inorder(trans, path, i) {
- struct btree_path *next = i + 1 < trans->nr_sorted
- ? trans->paths + trans->sorted[i + 1]
- : NULL;
+ verify_update_old_key(trans, i);
- if (path->nodes_locked) {
- if (path->nodes_intent_locked)
- nr_intent++;
- else
- nr_read++;
+ if (trans->journal_transaction_names) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_overwrite,
+ i->btree_id, i->level,
+ i->old_k.u64s);
+ bkey_reassemble(&entry->start[0],
+ (struct bkey_s_c) { &i->old_k, i->old_v });
+ }
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ i->btree_id, i->level,
+ i->k->k.u64s);
+ bkey_copy(&entry->start[0], i->k);
}
- if (!next || path_l(path)->b != path_l(next)->b) {
- if (nr_read && nr_intent)
- upgrade_readers(trans, path);
+ trans_for_each_wb_update(trans, wb) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ wb->btree, 0,
+ wb->k.k.u64s);
+ bkey_copy(&entry->start[0], &wb->k);
+ }
- nr_read = nr_intent = 0;
+ if (trans->journal_seq)
+ *trans->journal_seq = trans->journal_res.seq;
+ }
+
+ trans_for_each_update(trans, i) {
+ i->k->k.needs_whiteout = false;
+
+ if (!i->cached)
+ btree_insert_key_leaf(trans, i);
+ else if (!i->key_cache_already_flushed)
+ bch2_btree_insert_key_cached(trans, flags, i);
+ else {
+ bch2_btree_key_cache_drop(trans, i->path);
+ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
}
}
- bch2_trans_verify_locks(trans);
+ return 0;
+fatal_err:
+ bch2_fatal_error(c);
+revert_fs_usage:
+ if (trans->fs_usage_deltas)
+ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+ return ret;
}
-static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
- struct btree_path *path;
- unsigned i;
-
- trans_for_each_path_inorder(trans, path, i) {
- //if (path == pos)
- // break;
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
- if (path->nodes_locked != path->nodes_intent_locked &&
- !bch2_btree_path_upgrade(trans, path, path->level + 1))
- return true;
+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
}
- return false;
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static inline int trans_lock_write(struct btree_trans *trans)
@@ -599,87 +802,93 @@ static inline int trans_lock_write(struct btree_trans *trans)
if (same_leaf_as_prev(trans, i))
continue;
- if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
- if (have_conflicting_read_lock(trans, i->path))
- goto fail;
-
- __btree_node_lock_type(trans->c, insert_l(i)->b,
- SIX_LOCK_write);
- }
+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ return trans_lock_write_fail(trans, i);
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
-fail:
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
- }
-
- trace_trans_restart_would_deadlock_write(trans->fn);
- return btree_trans_restart(trans);
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
trans_for_each_update(trans, i)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+
+ trans_for_each_wb_update(trans, wb)
+ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry *i,
+ struct printbuf *err)
+{
+ struct bch_fs *c = trans->c;
+ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
+
+ printbuf_reset(err);
+ prt_printf(err, "invalid bkey on insert from %s -> %ps",
+ trans->fn, (void *) i->ip_allocated);
+ prt_newline(err);
+ printbuf_indent_add(err, 2);
+
+ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
+ prt_newline(err);
+
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, rw, err);
+ bch2_print_string_as_lines(KERN_ERR, err->buf);
+
+ bch2_inconsistent_error(c);
+ bch2_dump_trans_updates(trans);
+ printbuf_exit(err);
+
+ return -EINVAL;
}
+#endif
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
-static inline int do_bch2_trans_commit(struct btree_trans *trans,
+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
struct btree_insert_entry **stopped_at,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- struct bkey_s_c old;
int ret, u64s_delta = 0;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct printbuf buf = PRINTBUF;
+
trans_for_each_update(trans, i) {
- const char *invalid = bch2_bkey_invalid(c,
- bkey_i_to_s_c(i->k), i->bkey_type);
- if (invalid) {
- char buf[200];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
- buf, trans->fn, (void *) i->ip_allocated, invalid);
- bch2_fatal_error(c);
- return -EINVAL;
- }
+ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
+
+ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, rw, &buf)))
+ return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
btree_insert_entry_checks(trans, i);
}
+ printbuf_exit(&buf);
+#endif
trans_for_each_update(trans, i) {
- struct bkey u;
-
- /*
- * peek_slot() doesn't yet work on iterators that point to
- * interior nodes:
- */
- if (i->cached || i->level)
+ if (i->cached)
continue;
- old = bch2_btree_path_peek_slot(i->path, &u);
- ret = bkey_err(old);
- if (unlikely(ret))
- return ret;
-
u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+ u64s_delta -= i->old_btree_u64s;
if (!same_leaf_as_next(trans, i)) {
if (u64s_delta <= 0) {
ret = bch2_foreground_maybe_merge(trans, i->path,
- i->level, trans->flags);
+ i->level, flags);
if (unlikely(ret))
return ret;
}
@@ -690,24 +899,19 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
- JOURNAL_RES_GET_NONBLOCK|
- ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
- ? JOURNAL_RES_GET_RESERVED : 0));
- if (unlikely(ret == -EAGAIN))
- ret = bch2_trans_journal_preres_get_cold(trans,
- trans->journal_preres_u64s, trace_ip);
+ (flags & JOURNAL_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
+ if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
+ ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
if (unlikely(ret))
return ret;
- normalize_read_intent_locks(trans);
-
ret = trans_lock_write(trans);
if (unlikely(ret))
return ret;
- ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
+ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
- if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
trans_for_each_update(trans, i)
@@ -744,335 +948,271 @@ static int journal_reclaim_wait_done(struct bch_fs *c)
}
static noinline
-int bch2_trans_commit_error(struct btree_trans *trans,
+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
struct btree_insert_entry *i,
int ret, unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
switch (ret) {
- case BTREE_INSERT_BTREE_NODE_FULL:
- ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
- if (!ret)
- return 0;
-
- if (ret == -EINTR)
- trace_trans_restart_btree_node_split(trans->fn, trace_ip,
- i->btree_id, &i->path->pos);
+ case -BCH_ERR_btree_insert_btree_node_full:
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
break;
- case BTREE_INSERT_NEED_MARK_REPLICAS:
+ case -BCH_ERR_btree_insert_need_mark_replicas:
bch2_trans_unlock(trans);
ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
if (ret)
break;
- if (bch2_trans_relock(trans))
- return 0;
-
- trace_trans_restart_mark_replicas(trans->fn, trace_ip);
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip);
break;
- case BTREE_INSERT_NEED_JOURNAL_RES:
+ case -BCH_ERR_journal_res_get_blocked:
bch2_trans_unlock(trans);
- if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
- !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) {
- trans->restarted = true;
- ret = -EAGAIN;
+ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ !(flags & JOURNAL_WATERMARK_reserved)) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
- ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
+ ret = bch2_trans_journal_res_get(trans,
+ (flags & JOURNAL_WATERMARK_MASK)|
+ JOURNAL_RES_GET_CHECK);
if (ret)
break;
- if (bch2_trans_relock(trans))
- return 0;
-
- trace_trans_restart_journal_res_get(trans->fn, trace_ip);
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip);
break;
- case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
+ case -BCH_ERR_btree_insert_need_journal_reclaim:
bch2_trans_unlock(trans);
- trace_trans_blocked_journal_reclaim(trans->fn, trace_ip);
+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
if (ret < 0)
break;
- if (bch2_trans_relock(trans))
- return 0;
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
+ break;
+ case -BCH_ERR_btree_insert_need_flush_buffer: {
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ ret = 0;
+
+ if (wb->state.nr > wb->size * 3 / 4) {
+ bch2_trans_reset_updates(trans);
+ bch2_trans_unlock(trans);
+
+ mutex_lock(&wb->flush_lock);
- trace_trans_restart_journal_reclaim(trans->fn, trace_ip);
- ret = -EINTR;
+ if (wb->state.nr > wb->size * 3 / 4)
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ else
+ mutex_unlock(&wb->flush_lock);
+
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ }
break;
+ }
default:
BUG_ON(ret >= 0);
break;
}
- BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted);
- BUG_ON(ret == -ENOSPC &&
- !(trans->flags & BTREE_INSERT_NOWAIT) &&
- (trans->flags & BTREE_INSERT_NOFAIL));
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+ !(flags & BTREE_INSERT_NOWAIT) &&
+ (flags & BTREE_INSERT_NOFAIL), c,
+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
return ret;
}
static noinline int
-bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
{
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
test_bit(BCH_FS_STARTED, &c->flags))
- return -EROFS;
+ return -BCH_ERR_erofs_trans_commit;
bch2_trans_unlock(trans);
- ret = bch2_fs_read_write_early(c);
+ ret = bch2_fs_read_write_early(c) ?:
+ bch2_trans_relock(trans);
if (ret)
return ret;
- if (!bch2_trans_relock(trans))
- return -EINTR;
-
- percpu_ref_get(&c->writes);
+ bch2_write_ref_get(c, BCH_WRITE_REF_trans);
return 0;
}
-static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
- bool overwrite)
-{
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- struct bkey_s_c old;
- struct bkey unpacked;
- int ret = 0;
-
- if ((i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- return 0;
-
- if (!overwrite) {
- if (i->insert_trigger_run)
- return 0;
-
- BUG_ON(i->overwrite_trigger_run);
- i->insert_trigger_run = true;
- } else {
- if (i->overwrite_trigger_run)
- return 0;
-
- BUG_ON(!i->insert_trigger_run);
- i->overwrite_trigger_run = true;
- }
-
- old = bch2_btree_path_peek_slot(i->path, &unpacked);
- _deleted.p = i->path->pos;
-
- if (overwrite) {
- ret = bch2_trans_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|i->flags);
- } else if (old.k->type == i->k->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- i->overwrite_trigger_run = true;
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
- } else {
- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
- BTREE_TRIGGER_INSERT|i->flags);
- }
-
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->fn, _RET_IP_,
- i->btree_id, &i->path->pos);
- return ret ?: 1;
-}
-
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
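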
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- bool trans_trigger_run;
- int ret, overwrite;
-
- for (overwrite = 0; overwrite < 2; overwrite++) {
-
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
- i++) {
- ret = run_one_trigger(trans, i, overwrite);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
- }
-
- return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
- unsigned btree_id = 0;
int ret = 0;
- /*
- *
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
- btree_id_start++;
-
- ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ trans_for_each_update(trans, i) {
+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
if (ret)
- return ret;
+ break;
}
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-
- return 0;
+ return ret;
}
-int __bch2_trans_commit(struct btree_trans *trans)
+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
+ struct btree_write_buffered_key *wb;
unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
- !trans->extra_journal_entry_u64s)
+ !trans->nr_wb_updates &&
+ !trans->extra_journal_entries.nr)
goto out_reset;
- if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&c->gc_lock);
- memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
- trans->journal_u64s = trans->extra_journal_entry_u64s;
- trans->journal_preres_u64s = 0;
-
- trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out_reset;
- if (trans->journal_transaction_names)
- trans->journal_u64s += JSET_ENTRY_LOG_U64s;
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ goto out_reset;
+ }
- if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
- unlikely(!percpu_ref_tryget(&c->writes))) {
- ret = bch2_trans_commit_get_rw_cold(trans);
+ if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+ ret = bch2_trans_commit_get_rw_cold(trans, flags);
if (ret)
goto out_reset;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
- /*
- * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
- * from the key cache flush code:
- */
- trans_for_each_update(trans, i)
- if (!i->cached &&
- !(i->flags & BTREE_TRIGGER_NORUN))
- bch2_btree_key_cache_verify_clean(trans,
- i->btree_id, i->k->k.p);
-#endif
+ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
+ mutex_trylock(&c->btree_write_buffer.flush_lock)) {
+ bch2_trans_begin(trans);
+ bch2_trans_unlock(trans);
- ret = bch2_trans_commit_run_triggers(trans);
- if (ret)
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
goto out;
+ }
+
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+
+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
+
+ trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_preres_u64s = 0;
+
+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
- BUG_ON(!i->path->should_be_locked);
+ EBUG_ON(!i->path->should_be_locked);
- if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
- trace_trans_restart_upgrade(trans->fn, _RET_IP_,
- i->btree_id, &i->path->pos);
- ret = btree_trans_restart(trans);
+ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+ if (unlikely(ret))
goto out;
- }
- BUG_ON(!btree_node_intent_locked(i->path, i->level));
+ EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+
+ if (i->key_cache_already_flushed)
+ continue;
+ /* we're going to journal the key being updated: */
u64s = jset_u64s(i->k->k.u64s);
if (i->cached &&
- likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
+ likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
+
+ if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ continue;
+
trans->journal_u64s += u64s;
+
+ /* and we're also going to log the overwrite: */
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
+ trans_for_each_wb_update(trans, wb)
+ trans->journal_u64s += jset_u64s(wb->k.k.u64s);
+
if (trans->extra_journal_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
trans->extra_journal_res,
- (trans->flags & BTREE_INSERT_NOFAIL)
+ (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto err;
}
retry:
- BUG_ON(trans->restarted);
+ bch2_trans_verify_not_in_restart(trans);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- ret = do_bch2_trans_commit(trans, &i, _RET_IP_);
+ ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
bch2_trans_verify_locks(trans);
if (ret)
goto err;
+
+ trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
- if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
- percpu_ref_put(&c->writes);
+ if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+ bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
- trans_for_each_update(trans, i)
- bch2_path_put(trans, i->path, true);
-
- trans->extra_journal_res = 0;
- trans->nr_updates = 0;
- trans->hooks = NULL;
- trans->extra_journal_entries = NULL;
- trans->extra_journal_entry_u64s = 0;
-
- if (trans->fs_usage_deltas) {
- trans->fs_usage_deltas->used = 0;
- memset(&trans->fs_usage_deltas->memset_start, 0,
- (void *) &trans->fs_usage_deltas->memset_end -
- (void *) &trans->fs_usage_deltas->memset_start);
- }
+ bch2_trans_reset_updates(trans);
return ret;
err:
- ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_);
+ ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
if (ret)
goto out;
goto retry;
}
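
__bch2_trans_commit() above is built around a retry/err pair of labels: attempt the commit, and on a recoverable error let the error handler unlock, wait for or reserve whatever was missing, relock, and loop. A stripped-down sketch of that control flow, with hypothetical error codes and no real locking:

	#include <stdio.h>

	enum {
		ERR_NONE = 0,
		ERR_NEED_RESOURCE = -1,	/* recoverable: grab the resource and retry */
		ERR_HARD = -2,		/* not recoverable */
	};

	static int attempts;

	static int do_commit(void)
	{
		/* pretend the first attempt is short on some resource */
		return attempts++ ? ERR_NONE : ERR_NEED_RESOURCE;
	}

	static int commit_error(int err)
	{
		if (err == ERR_NEED_RESOURCE) {
			/* unlock, acquire the missing resource, relock... */
			return 0;	/* 0: fixed up, caller should retry */
		}
		return err;		/* anything else is fatal to this commit */
	}

	static int commit(void)
	{
		int ret;
	retry:
		ret = do_commit();
		if (ret)
			goto err;
	out:
		return ret;
	err:
		ret = commit_error(ret);
		if (ret)
			goto out;
		goto retry;
	}

	int main(void)
	{
		printf("commit: %d after %d attempt(s)\n", commit(), attempts);
		return 0;
	}
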
-static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
enum btree_id id,
struct bpos pos)
{
@@ -1081,12 +1221,6 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- if (!btree_type_has_snapshots(id))
- return 0;
-
- if (!snapshot_t(c, pos.snapshot)->children[0])
- return 0;
-
bch2_trans_iter_init(trans, &iter, id, pos,
BTREE_ITER_NOT_EXTENTS|
BTREE_ITER_ALL_SNAPSHOTS);
@@ -1099,7 +1233,7 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
if (!k.k)
break;
- if (bkey_cmp(pos, k.k->p))
+ if (!bkey_eq(pos, k.k->p))
break;
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
@@ -1112,13 +1246,109 @@ static int check_pos_snapshot_overwritten(struct btree_trans *trans,
return ret;
}
+static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ if (!btree_type_has_snapshots(id) ||
+ pos.snapshot == U32_MAX ||
+ !snapshot_t(trans->c, pos.snapshot)->children[0])
+ return 0;
+
+ return __check_pos_snapshot_overwritten(trans, id, pos);
+}
+
+static noinline int extent_front_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bkey_i **insert,
+ enum btree_update_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *update;
+ int ret;
+
+ update = bch2_bkey_make_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ return ret;
+
+ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
+ return 0;
+
+ ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?:
+ check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ ret = bch2_btree_delete_at(trans, iter, flags);
+ if (ret)
+ return ret;
+
+ *insert = update;
+ return 0;
+}
+
+static noinline int extent_back_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ int ret;
+
+ ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?:
+ check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ return 0;
+}
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+
+ if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+ return 0;
+
+ pos.snapshot++;
+
+ for_each_btree_key_norestart(trans, iter, btree_id, pos,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_NOPRESERVE, k, ret) {
+ if (!bkey_eq(k.k->p, pos))
+ break;
+
+ if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+ k.k->p.snapshot)) {
+ ret = !bkey_whiteout(k.k);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
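
need_whiteout_for_snapshot() above answers: when deleting a key in a child snapshot, is there still a live key at the same position in an ancestor snapshot, so the delete must be recorded as a whiteout rather than simply dropped? A toy model of that decision, with a parent[] array standing in for the snapshot tree (not the real snapshot code, and it stops at the first ancestor match):

	#include <stdbool.h>
	#include <stdio.h>

	#define NR_SNAPSHOTS 8

	/* parent[s] == 0 means s is a root snapshot */
	static unsigned parent[NR_SNAPSHOTS] = { [3] = 2, [2] = 1 };

	struct key { unsigned long long pos; unsigned snapshot; bool whiteout; };

	static bool is_ancestor(unsigned anc, unsigned desc)
	{
		while (desc && desc != anc)
			desc = parent[desc];
		return desc == anc;
	}

	/*
	 * Deleting (pos, snap): emit a whiteout iff some ancestor snapshot still
	 * has a live (non-whiteout) key at pos that would otherwise become
	 * visible again.
	 */
	static bool need_whiteout(const struct key *keys, unsigned nr,
				  unsigned long long pos, unsigned snap)
	{
		for (unsigned i = 0; i < nr; i++)
			if (keys[i].pos == pos &&
			    keys[i].snapshot != snap &&
			    is_ancestor(keys[i].snapshot, snap))
				return !keys[i].whiteout;
		return false;
	}

	int main(void)
	{
		struct key keys[] = {
			{ .pos = 10, .snapshot = 1 },		/* ancestor copy */
			{ .pos = 10, .snapshot = 3 },		/* being deleted */
		};

		printf("%d\n", need_whiteout(keys, 2, 10, 3));	/* 1: ancestor key visible */
		return 0;
	}
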
int bch2_trans_update_extent(struct btree_trans *trans,
struct btree_iter *orig_iter,
struct bkey_i *insert,
enum btree_update_flags flags)
{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, update_iter;
+ struct btree_iter iter;
struct bpos start = bkey_start_pos(&insert->k);
struct bkey_i *update;
struct bkey_s_c k;
@@ -1129,58 +1359,25 @@ int bch2_trans_update_extent(struct btree_trans *trans,
BTREE_ITER_INTENT|
BTREE_ITER_WITH_UPDATES|
BTREE_ITER_NOT_EXTENTS);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
goto out;
- if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
- /*
- * We can't merge extents if they belong to interior snapshot
- * tree nodes, and there's a snapshot in which one extent is
- * visible and the other is not - i.e. if visibility is
- * different.
- *
- * Instead of checking if visibility of the two extents is
- * different, for now we just check if either has been
- * overwritten:
- */
- ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
- if (ret < 0)
- goto err;
- if (ret)
- goto nomerge1;
-
- ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
- if (ret < 0)
- goto err;
- if (ret)
- goto nomerge1;
-
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- goto err;
-
- bkey_reassemble(update, k);
-
- if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) {
- ret = bch2_btree_delete_at(trans, &iter, flags);
+ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ ret = extent_front_merge(trans, &iter, k, &insert, flags);
if (ret)
goto err;
-
- insert = update;
- goto next;
}
- }
-nomerge1:
- ret = 0;
- if (!bkey_cmp(k.k->p, start))
+
goto next;
+ }
- while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
- bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
- bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0;
+ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
+ bool front_split = bkey_lt(bkey_start_pos(k.k), start);
+ bool back_split = bkey_gt(k.k->p, insert->k.p);
/*
* If we're going to be splitting a compressed extent, note it
@@ -1193,84 +1390,63 @@ nomerge1:
trans->extra_journal_res += compressed_sectors;
if (front_split) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ update = bch2_bkey_make_mut(trans, k);
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
- bkey_reassemble(update, k);
-
bch2_cut_back(start, update);
- bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&update_iter) ?:
- bch2_trans_update(trans, &update_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- flags);
- bch2_trans_iter_exit(trans, &update_iter);
-
+ ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
if (ret)
goto err;
}
if (k.k->p.snapshot != insert->k.p.snapshot &&
(front_split || back_split)) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ update = bch2_bkey_make_mut(trans, k);
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
- bkey_reassemble(update, k);
-
bch2_cut_front(start, update);
bch2_cut_back(insert->k.p, update);
- bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&update_iter) ?:
- bch2_trans_update(trans, &update_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- flags);
- bch2_trans_iter_exit(trans, &update_iter);
+ ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
if (ret)
goto err;
}
- if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
+ if (bkey_le(k.k->p, insert->k.p)) {
update = bch2_trans_kmalloc(trans, sizeof(*update));
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
bkey_init(&update->k);
update->k.p = k.k->p;
+ update->k.p.snapshot = insert->k.p.snapshot;
if (insert->k.p.snapshot != k.k->p.snapshot) {
- update->k.p.snapshot = insert->k.p.snapshot;
update->k.type = KEY_TYPE_whiteout;
+ } else if (btree_type_has_snapshots(btree_id)) {
+ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ update->k.type = KEY_TYPE_whiteout;
}
- bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&update_iter) ?:
- bch2_trans_update(trans, &update_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- flags);
- bch2_trans_iter_exit(trans, &update_iter);
-
+ ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
if (ret)
goto err;
}
if (back_split) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ update = bch2_bkey_make_mut(trans, k);
if ((ret = PTR_ERR_OR_ZERO(update)))
goto err;
- bkey_reassemble(update, k);
bch2_cut_front(insert->k.p, update);
ret = bch2_trans_update_by_path(trans, iter.path, update,
@@ -1281,7 +1457,8 @@ nomerge1:
goto out;
}
next:
- k = bch2_btree_iter_next(&iter);
+ bch2_btree_iter_advance(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -1289,22 +1466,10 @@ next:
}
if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
- ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
- if (ret < 0)
- goto err;
+ ret = extent_back_merge(trans, &iter, insert, k);
if (ret)
- goto nomerge2;
-
- ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
- if (ret < 0)
goto err;
- if (ret)
- goto nomerge2;
-
- bch2_bkey_merge(c, bkey_i_to_s(insert), k);
}
-nomerge2:
- ret = 0;
out:
if (!bkey_deleted(&insert->k)) {
/*
@@ -1324,49 +1489,49 @@ err:
return ret;
}
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos)
+static int __must_check
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags,
+ unsigned long ip);
+
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_insert_entry *i,
+ enum btree_update_flags flags,
+ unsigned long ip)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot = pos.snapshot;
+ struct btree_path *btree_path;
int ret;
- if (!bch2_snapshot_parent(trans->c, pos.snapshot))
- return 0;
+ i->key_cache_already_flushed = true;
+ i->flags |= BTREE_TRIGGER_NORUN;
- pos.snapshot++;
-
- for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOPRESERVE, k, ret) {
- if (bkey_cmp(k.k->p, pos))
- break;
+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
- if (bch2_snapshot_is_ancestor(trans->c, snapshot,
- k.k->p.snapshot)) {
- ret = !bkey_whiteout(k.k);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
+ ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ if (ret)
+ goto err;
+ btree_path_set_should_be_locked(btree_path);
+ ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip);
+err:
+ bch2_path_put(trans, btree_path, true);
return ret;
}
-int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *k, enum btree_update_flags flags)
+static int __must_check
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags,
+ unsigned long ip)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
+ int cmp;
- BUG_ON(!path->should_be_locked);
-
- BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
- BUG_ON(bpos_cmp(k->k.p, path->pos));
+ EBUG_ON(!path->should_be_locked);
+ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ EBUG_ON(!bpos_eq(k->k.p, path->pos));
n = (struct btree_insert_entry) {
.flags = flags,
@@ -1376,7 +1541,7 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
.cached = path->cached,
.path = path,
.k = k,
- .ip_allocated = _RET_IP_,
+ .ip_allocated = ip,
};
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -1389,44 +1554,77 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
*/
- trans_for_each_update(trans, i)
- if (btree_insert_entry_cmp(&n, i) <= 0)
+ trans_for_each_update(trans, i) {
+ cmp = btree_insert_entry_cmp(&n, i);
+ if (cmp <= 0)
break;
+ }
- if (i < trans->updates + trans->nr_updates &&
- !btree_insert_entry_cmp(&n, i)) {
- BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
- /*
- * This is a hack to ensure that inode creates update the btree,
- * not the key cache, which helps with cache coherency issues in
- * other areas:
- */
- if (n.cached && !i->cached) {
- i->k = n.k;
- i->flags = n.flags;
- return 0;
- }
+ if (!cmp && i < trans->updates + trans->nr_updates) {
+ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
bch2_path_put(trans, i->path, true);
- *i = n;
- } else
+ i->flags = n.flags;
+ i->cached = n.cached;
+ i->k = n.k;
+ i->path = n.path;
+ i->ip_allocated = n.ip_allocated;
+ } else {
array_insert_item(trans->updates, trans->nr_updates,
i - trans->updates, n);
- __btree_path_get(n.path, true);
+ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+
+ __btree_path_get(i->path, true);
+
+ /*
+ * If a key is present in the key cache, it must also exist in the
+ * btree - this is necessary for cache coherency. When iterating over
+ * a btree that's cached in the key cache, the btree iter code checks
+ * the key cache - but the key has to exist in the btree for that to
+ * work:
+ */
+ if (path->cached &&
+ bkey_deleted(&i->old_k) &&
+ !(flags & BTREE_UPDATE_NO_KEY_CACHE_COHERENCY))
+ return flush_new_cached_update(trans, path, i, flags, ip);
+
return 0;
}
+static inline int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_);
+}
+
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
+ struct btree_path *path = iter->update_path ?: iter->path;
+ struct bkey_cached *ck;
+ int ret;
+
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return bch2_trans_update_extent(trans, iter, k, flags);
if (bkey_deleted(&k->k) &&
+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
- int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
if (unlikely(ret < 0))
return ret;
@@ -1434,8 +1632,99 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
k->k.type = KEY_TYPE_whiteout;
}
- return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
- k, flags);
+ /*
+ * Ensure that updates to cached btrees go to the key cache:
+ */
+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ !path->cached &&
+ !path->level &&
+ btree_id_cached(trans->c, path->btree_id)) {
+ if (!iter->key_cache_path ||
+ !iter->key_cache_path->should_be_locked ||
+ !bpos_eq(iter->key_cache_path->pos, k->k.p)) {
+ if (!iter->key_cache_path)
+ iter->key_cache_path =
+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_CACHED, _THIS_IP_);
+
+ iter->key_cache_path =
+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ BTREE_ITER_CACHED);
+ if (unlikely(ret))
+ return ret;
+
+ ck = (void *) iter->key_cache_path->l[0].b;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+ }
+
+ btree_path_set_should_be_locked(iter->key_cache_path);
+ }
+
+ path = iter->key_cache_path;
+ }
+
+ return bch2_trans_update_by_path(trans, path, k, flags);
+}
+
+int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct btree_write_buffered_key *i;
+ int ret;
+
+ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ trans_for_each_wb_update(trans, i) {
+ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
+ bkey_copy(&i->k, k);
+ return 0;
+ }
+ }
+
+ if (!trans->wb_updates ||
+ trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_write_buffered_key *u;
+
+ if (trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ BUG_ON(trans->wb_updates_size > U8_MAX / 2);
+ trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
+ if (s)
+ s->wb_updates_size = trans->wb_updates_size;
+ }
+
+ u = bch2_trans_kmalloc_nomemzero(trans,
+ trans->wb_updates_size *
+ sizeof(struct btree_write_buffered_key));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ if (trans->nr_wb_updates)
+ memcpy(u, trans->wb_updates, trans->nr_wb_updates *
+ sizeof(struct btree_write_buffered_key));
+ trans->wb_updates = u;
+ }
+
+ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
+ .btree = btree,
+ };
+
+ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
+ trans->nr_wb_updates++;
+
+ return 0;
}
void bch2_trans_commit_hook(struct btree_trans *trans,
@@ -1445,16 +1734,33 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
trans->hooks = h;
}
-int __bch2_btree_insert(struct btree_trans *trans,
- enum btree_id id, struct bkey_i *k)
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_i *k,
+ enum btree_update_flags flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
+ struct bkey_i *k, enum btree_update_flags flags)
{
struct btree_iter iter;
int ret;
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+ BTREE_ITER_CACHED|
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1472,11 +1778,11 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
u64 *journal_seq, int flags)
{
return bch2_trans_do(c, disk_res, journal_seq, flags,
- __bch2_btree_insert(&trans, id, k));
+ __bch2_btree_insert(&trans, id, k, 0));
}
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+ unsigned len, unsigned update_flags)
{
struct bkey_i *k;
@@ -1486,28 +1792,50 @@ int bch2_btree_delete_at(struct btree_trans *trans,
bkey_init(&k->k);
k->k.p = iter->pos;
+ bch2_key_resize(&k->k, len);
return bch2_trans_update(trans, iter, k, update_flags);
}
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned update_flags)
+{
+ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
+}
+
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ struct bkey_i *k;
+
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.p = pos;
+ return bch2_trans_update_buffered(trans, btree, k);
+}
+
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
- unsigned iter_flags,
+ unsigned update_flags,
u64 *journal_seq)
{
+ u32 restart_count = trans->restart_count;
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
-retry:
- while ((bch2_trans_begin(trans),
- (k = bch2_btree_iter_peek(&iter)).k) &&
- !(ret = bkey_err(k)) &&
- bkey_cmp(iter.pos, end) < 0) {
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
bkey_init(&delete.k);
/*
@@ -1526,33 +1854,33 @@ retry:
*/
delete.k.p = iter.pos;
- if (iter.flags & BTREE_ITER_IS_EXTENTS) {
- unsigned max_sectors =
- KEY_SIZE_MAX & (~0 << trans->c->block_bits);
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete);
+ if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
- ret = bch2_extent_trim_atomic(trans, &iter, &delete);
- if (ret)
- break;
- }
-
- ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
+ BTREE_INSERT_NOFAIL);
bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+ /*
+ * the bch2_trans_begin() call is in a weird place because we
+ * need to call it after every transaction commit, to avoid path
+ * overflow, but don't want to call it if the delete operation
+ * is a no-op and we have no work to do:
+ */
+ bch2_trans_begin(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
if (ret)
break;
}
-
- if (ret == -EINTR) {
- ret = 0;
- goto retry;
- }
-
bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && trans_was_restarted(trans, restart_count))
+ ret = -BCH_ERR_transaction_restart_nested;
return ret;
}
@@ -1563,10 +1891,92 @@ retry:
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start, struct bpos end,
- unsigned iter_flags,
+ unsigned update_flags,
u64 *journal_seq)
{
- return bch2_trans_do(c, NULL, journal_seq, 0,
- bch2_btree_delete_range_trans(&trans, id, start, end,
- iter_flags, journal_seq));
+ int ret = bch2_trans_run(c,
+ bch2_btree_delete_range_trans(&trans, id, start, end,
+ update_flags, journal_seq));
+ if (ret == -BCH_ERR_transaction_restart_nested)
+ ret = 0;
+ return ret;
+}
+
+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+{
+ struct printbuf buf = PRINTBUF;
+ struct jset_entry_log *l;
+ unsigned u64s;
+ int ret;
+
+ prt_vprintf(&buf, fmt, args);
+ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ goto err;
+
+ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+
+ ret = darray_make_room(entries, jset_u64s(u64s));
+ if (ret)
+ goto err;
+
+ l = (void *) &darray_top(*entries);
+ l->entry.u64s = cpu_to_le16(u64s);
+ l->entry.btree_id = 0;
+ l->entry.level = 1;
+ l->entry.type = BCH_JSET_ENTRY_log;
+ l->entry.pad[0] = 0;
+ l->entry.pad[1] = 0;
+ l->entry.pad[2] = 0;
+ memcpy(l->d, buf.buf, buf.pos);
+ while (buf.pos & 7)
+ l->d[buf.pos++] = '\0';
+
+ entries->nr += jset_u64s(u64s);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+ va_list args)
+{
+ int ret;
+
+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+ } else {
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|commit_flags,
+ __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
+ }
+
+ return ret;
+}
+
+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, 0, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ */
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, JOURNAL_WATERMARK_reserved, fmt, args);
+ va_end(args);
+ return ret;
}
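
Illustrative sketch, not from this patch: the buffered-update helpers added above (bch2_trans_update_buffered(), bch2_btree_delete_at_buffered()) are meant for small non-extent keys whose ordering relative to other updates does not matter, and the key (header plus value) must fit in BTREE_WRITE_BUFERED_U64s_MAX u64s, which a deletion trivially satisfies. Reusing the bch2_trans_do() pattern already used by bch2_btree_insert(), a one-off buffered deletion might look roughly like this (example_delete_buffered is a hypothetical name):

	/* hypothetical helper, for illustration only */
	static int example_delete_buffered(struct bch_fs *c,
					   enum btree_id btree, struct bpos pos)
	{
		return bch2_trans_do(c, NULL, NULL, 0,
				     bch2_btree_delete_at_buffered(&trans, btree, pos));
	}
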
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
new file mode 100644
index 000000000000..9983a47853b9
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+
+#include <linux/sort.h>
+
+static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->btree, r->btree) ?:
+ bpos_cmp(l->k.k.p, r->k.k.p) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
+}
+
+static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ unsigned commit_flags,
+ bool *write_locked,
+ size_t *fast)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ path = iter->path;
+
+ if (!*write_locked) {
+ ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
+ *write_locked = true;
+ }
+
+ if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ goto trans_commit;
+ }
+
+ bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
+ (*fast)++;
+
+ if (path->ref > 1) {
+ /*
+ * We can't clone a path that has write locks: if the path is
+ * shared, unlock before set_pos(), traverse():
+ */
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ }
+ return 0;
+trans_commit:
+ return bch2_trans_update(trans, iter, &wb->k, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM);
+}
+
+static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
+{
+ union btree_write_buffer_state old, new;
+ u64 v = READ_ONCE(wb->state.v);
+
+ do {
+ old.v = new.v = v;
+
+ new.nr = 0;
+ new.idx++;
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
+ cpu_relax();
+
+ smp_mb();
+
+ return old;
+}
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
+ bool locked)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct journal_entry_pin pin;
+ struct btree_write_buffered_key *i, *keys;
+ struct btree_iter iter = { NULL };
+ size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+ bool write_locked = false;
+ union btree_write_buffer_state s;
+ int ret = 0;
+
+ memset(&pin, 0, sizeof(pin));
+
+ if (!locked && !mutex_trylock(&wb->flush_lock))
+ return 0;
+
+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+ bch2_journal_pin_drop(j, &wb->journal_pin);
+
+ s = btree_write_buffer_switch(wb);
+ keys = wb->keys[s.idx];
+ nr = s.nr;
+
+ /*
+ * We first sort so that we can detect and skip redundant updates, and
+ * then we attempt to flush in sorted btree order, as this is most
+ * efficient.
+ *
+ * However, since we're not flushing in the order they appear in the
+ * journal we won't be able to drop our journal pin until everything is
+ * flushed - which means this could deadlock the journal if we weren't
+ * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * if it would block taking a journal reservation.
+ *
+ * If that happens, simply skip the key so we can optimistically insert
+ * as many keys as possible in the fast path.
+ */
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_key_cmp, NULL);
+
+ for (i = keys; i < keys + nr; i++) {
+ if (i + 1 < keys + nr &&
+ i[0].btree == i[1].btree &&
+ bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
+ skipped++;
+ i->journal_seq = 0;
+ continue;
+ }
+
+ if (write_locked &&
+ (iter.path->btree_id != i->btree ||
+ bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ write_locked = false;
+ }
+
+ if (!iter.path || iter.path->btree_id != i->btree) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+ }
+
+ bch2_btree_iter_set_pos(&iter, i->k.k.p);
+ iter.path->preserve = false;
+
+ do {
+ ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
+ commit_flags, &write_locked, &fast);
+ if (!write_locked)
+ bch2_trans_begin(trans);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ slowpath++;
+ continue;
+ }
+ if (ret)
+ break;
+
+ i->journal_seq = 0;
+ }
+
+ if (write_locked)
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ bch2_trans_iter_exit(trans, &iter);
+
+ trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+
+ if (slowpath)
+ goto slowpath;
+
+ bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+out:
+ bch2_journal_pin_drop(j, &pin);
+ mutex_unlock(&wb->flush_lock);
+ return ret;
+slowpath:
+ trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+
+ /*
+ * Now sort the rest by journal seq and bump the journal pin as we go.
+ * The fast path zapped the seq of keys that were successfully flushed so
+ * we can skip those here.
+ */
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_journal_cmp,
+ NULL);
+
+ for (i = keys; i < keys + nr; i++) {
+ if (!i->journal_seq)
+ continue;
+
+ if (i->journal_seq > pin.seq) {
+ struct journal_entry_pin pin2;
+
+ memset(&pin2, 0, sizeof(pin2));
+
+ bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin);
+ bch2_journal_pin_copy(j, &pin, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin2);
+ }
+
+ ret = commit_do(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ JOURNAL_WATERMARK_reserved,
+ __bch2_btree_insert(trans, i->btree, &i->k, 0));
+ if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
+ break;
+ }
+
+ goto out;
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ mutex_lock(&trans->c->btree_write_buffer.flush_lock);
+ return __bch2_btree_write_buffer_flush(trans, 0, true);
+}
+
+int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+{
+ return __bch2_btree_write_buffer_flush(trans, 0, false);
+}
+
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_lock(&wb->flush_lock);
+
+ return bch2_trans_run(c,
+ __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+}
+
+static inline u64 btree_write_buffer_ref(int idx)
+{
+ return ((union btree_write_buffer_state) {
+ .ref0 = idx == 0,
+ .ref1 = idx == 1,
+ }).v;
+}
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key *i;
+ union btree_write_buffer_state old, new;
+ int ret = 0;
+ u64 v;
+
+ trans_for_each_wb_update(trans, i) {
+ EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ i->journal_seq = trans->journal_res.seq;
+ i->journal_offset = trans->journal_res.offset;
+ }
+
+ preempt_disable();
+ v = READ_ONCE(wb->state.v);
+ do {
+ old.v = new.v = v;
+
+ new.v += btree_write_buffer_ref(new.idx);
+ new.nr += trans->nr_wb_updates;
+ if (new.nr > wb->size) {
+ ret = -BCH_ERR_btree_insert_need_flush_buffer;
+ goto out;
+ }
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ memcpy(wb->keys[new.idx] + old.nr,
+ trans->wb_updates,
+ sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+out:
+ preempt_enable();
+ return ret;
+}
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+
+ kvfree(wb->keys[1]);
+ kvfree(wb->keys[0]);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_init(&wb->flush_lock);
+ wb->size = c->opts.btree_write_buffer_size;
+
+ wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
+ wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
+ if (!wb->keys[0] || !wb->keys[1])
+ return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+
+ return 0;
+}
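
Illustrative sketch, not from this patch: buffered keys only become visible in the btree once the write buffer is flushed. bch2_btree_write_buffer_flush() is opportunistic (it returns 0 if another flush already holds flush_lock), while the _sync variant drops the transaction's locks and waits for the mutex. Using the bch2_trans_run() pattern seen above, forcing a flush from outside an existing transaction could look roughly like this (example_flush_write_buffer is a hypothetical name):

	/* hypothetical helper, for illustration only */
	static int example_flush_write_buffer(struct bch_fs *c)
	{
		return bch2_trans_run(c,
			bch2_btree_write_buffer_flush_sync(&trans));
	}
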
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
new file mode 100644
index 000000000000..322df1c8304e
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_H
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+int bch2_btree_write_buffer_flush(struct btree_trans *);
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+int bch2_fs_btree_write_buffer_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
new file mode 100644
index 000000000000..99993ba77aea
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+
+#include "journal_types.h"
+
+#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
+#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
+
+struct btree_write_buffered_key {
+ u64 journal_seq;
+ unsigned journal_offset;
+ enum btree_id btree;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
+
+union btree_write_buffer_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 nr:23;
+ u64 idx:1;
+ u64 ref0:20;
+ u64 ref1:20;
+ };
+};
+
+struct btree_write_buffer {
+ struct mutex flush_lock;
+ struct journal_entry_pin journal_pin;
+
+ union btree_write_buffer_state state;
+ size_t size;
+
+ struct btree_write_buffered_key *keys[2];
+};
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
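
A note on the packed state word (a reading of the code above, not text from the patch): nr, idx, ref0 and ref1 share a single 64-bit value so that bch2_btree_insert_keys_write_buffer() can reserve space and take a reference on the active buffer with one atomic64 cmpxchg - the increment is built as ((union btree_write_buffer_state) { .ref0 = idx == 0, .ref1 = idx == 1 }).v, which bumps exactly one of the two refcounts along with nr. btree_write_buffer_switch() then publishes a new idx with nr reset to zero and spins until the retired buffer's refcount drains, at which point the flush path can safely read that buffer's keys.
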
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 64bed7aa3eb9..0362e10eb6ae 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -7,6 +7,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
+#include "backpointers.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
@@ -88,20 +89,17 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
: ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
struct bch_fs *c = ca->fs;
- struct bch_dev_usage ret;
unsigned seq, i, u64s = dev_usage_u64s();
do {
seq = read_seqcount_begin(&c->usage_lock);
- memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+ memcpy(usage, ca->usage_base, u64s * sizeof(u64));
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
- acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
-
- return ret;
}
static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
@@ -139,23 +137,28 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage_online *ret;
- unsigned seq, i, u64s;
+ unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1;
+retry:
+ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+ if (unlikely(!ret))
+ return NULL;
percpu_down_read(&c->mark_lock);
- ret = kmalloc(sizeof(struct bch_fs_usage_online) +
- sizeof(u64) * c->replicas.nr, GFP_NOFS);
- if (unlikely(!ret)) {
+ v = fs_usage_u64s(c) + 1;
+ if (unlikely(u64s != v)) {
+ u64s = v;
percpu_up_read(&c->mark_lock);
- return NULL;
+ kfree(ret);
+ goto retry;
}
ret->online_reserved = percpu_u64_get(c->online_reserved);
- u64s = fs_usage_u64s(c);
do {
seq = read_seqcount_begin(&c->usage_lock);
- memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
+ unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64),
+ "embedded variable length struct");
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
@@ -197,26 +200,26 @@ void bch2_fs_usage_to_text(struct printbuf *out,
{
unsigned i;
- pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
+ prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
- pr_buf(out, "hidden:\t\t\t\t%llu\n",
+ prt_printf(out, "hidden:\t\t\t\t%llu\n",
fs_usage->u.hidden);
- pr_buf(out, "data:\t\t\t\t%llu\n",
+ prt_printf(out, "data:\t\t\t\t%llu\n",
fs_usage->u.data);
- pr_buf(out, "cached:\t\t\t\t%llu\n",
+ prt_printf(out, "cached:\t\t\t\t%llu\n",
fs_usage->u.cached);
- pr_buf(out, "reserved:\t\t\t%llu\n",
+ prt_printf(out, "reserved:\t\t\t%llu\n",
fs_usage->u.reserved);
- pr_buf(out, "nr_inodes:\t\t\t%llu\n",
+ prt_printf(out, "nr_inodes:\t\t\t%llu\n",
fs_usage->u.nr_inodes);
- pr_buf(out, "online reserved:\t\t%llu\n",
+ prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
for (i = 0;
i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
i++) {
- pr_buf(out, "%u replicas:\n", i + 1);
- pr_buf(out, "\treserved:\t\t%llu\n",
+ prt_printf(out, "%u replicas:\n", i + 1);
+ prt_printf(out, "\treserved:\t\t%llu\n",
fs_usage->u.persistent_reserved[i]);
}
@@ -224,9 +227,9 @@ void bch2_fs_usage_to_text(struct printbuf *out,
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- pr_buf(out, "\t");
+ prt_printf(out, "\t");
bch2_replicas_entry_to_text(out, e);
- pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
+ prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
}
}
@@ -279,44 +282,22 @@ bch2_fs_usage_read_short(struct bch_fs *c)
return ret;
}
-static inline int is_unavailable_bucket(struct bucket_mark m)
+void bch2_dev_usage_init(struct bch_dev *ca)
{
- return !is_available_bucket(m);
+ ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
}
static inline int bucket_sectors_fragmented(struct bch_dev *ca,
- struct bucket_mark m)
+ struct bch_alloc_v4 a)
{
- return bucket_sectors_used(m)
- ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+ return a.dirty_sectors
+ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
: 0;
}
-static inline int is_stripe_data_bucket(struct bucket_mark m)
-{
- return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
-static inline enum bch_data_type bucket_type(struct bucket_mark m)
-{
- return m.cached_sectors && !m.dirty_sectors
- ? BCH_DATA_cached
- : m.data_type;
-}
-
-static inline void account_bucket(struct bch_fs_usage *fs_usage,
- struct bch_dev_usage *dev_usage,
- enum bch_data_type type,
- int nr, s64 size)
-{
- if (type == BCH_DATA_sb || type == BCH_DATA_journal)
- fs_usage->hidden += size;
-
- dev_usage->d[type].buckets += nr;
-}
-
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bucket_mark old, struct bucket_mark new,
+ struct bch_alloc_v4 old,
+ struct bch_alloc_v4 new,
u64 journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
@@ -324,32 +305,52 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- u = dev_usage_ptr(ca, journal_seq, gc);
- if (bucket_type(old))
- account_bucket(fs_usage, u, bucket_type(old),
- -1, -ca->mi.bucket_size);
+ if (data_type_is_hidden(old.data_type))
+ fs_usage->hidden -= ca->mi.bucket_size;
+ if (data_type_is_hidden(new.data_type))
+ fs_usage->hidden += ca->mi.bucket_size;
+
+ u = dev_usage_ptr(ca, journal_seq, gc);
- if (bucket_type(new))
- account_bucket(fs_usage, u, bucket_type(new),
- 1, ca->mi.bucket_size);
+ u->d[old.data_type].buckets--;
+ u->d[new.data_type].buckets++;
- u->buckets_ec += (int) new.stripe - (int) old.stripe;
- u->buckets_unavailable +=
- is_unavailable_bucket(new) - is_unavailable_bucket(old);
+ u->buckets_ec -= (int) !!old.stripe;
+ u->buckets_ec += (int) !!new.stripe;
u->d[old.data_type].sectors -= old.dirty_sectors;
u->d[new.data_type].sectors += new.dirty_sectors;
- u->d[BCH_DATA_cached].sectors +=
- (int) new.cached_sectors - (int) old.cached_sectors;
+
+ u->d[BCH_DATA_cached].sectors += new.cached_sectors;
+ u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
preempt_enable();
+}
+
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket old, struct bucket new,
+ u64 journal_seq, bool gc)
+{
+ struct bch_alloc_v4 old_a = {
+ .gen = old.gen,
+ .data_type = old.data_type,
+ .dirty_sectors = old.dirty_sectors,
+ .cached_sectors = old.cached_sectors,
+ .stripe = old.stripe,
+ };
+ struct bch_alloc_v4 new_a = {
+ .gen = new.gen,
+ .data_type = new.data_type,
+ .dirty_sectors = new.dirty_sectors,
+ .cached_sectors = new.cached_sectors,
+ .stripe = new.stripe,
+ };
- if (!is_available_bucket(old) && is_available_bucket(new))
- bch2_wake_allocator(ca);
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
}
static inline int __update_replicas(struct bch_fs *c,
@@ -373,22 +374,22 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
{
struct bch_fs_usage __percpu *fs_usage;
int idx, ret = 0;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
percpu_down_read(&c->mark_lock);
+ buf.atomic++;
idx = bch2_replicas_entry_idx(c, r);
if (idx < 0 &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err(c, "no replicas entry\n"
- " while marking %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) {
+ fsck_err(c, "no replicas entry\n"
+ " while marking %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, r);
- if (ret)
- return ret;
-
percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
idx = bch2_replicas_entry_idx(c, r);
}
if (idx < 0) {
@@ -404,6 +405,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
err:
fsck_err:
percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
return ret;
}
@@ -467,7 +469,8 @@ static inline void update_replicas_list(struct btree_trans *trans,
n = (void *) d->d + d->used;
n->delta = sectors;
- memcpy(&n->r, r, replicas_entry_bytes(r));
+ memcpy((void *) n + offsetof(struct replicas_delta, r),
+ r, replicas_entry_bytes(r));
bch2_replicas_entry_sort(&n->r);
d->used += b;
}
@@ -482,31 +485,18 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator)
-{
- struct bucket *g = bucket(ca, b);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- new.owned_by_allocator = owned_by_allocator;
- }));
-
- BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
-static int bch2_mark_alloc(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_alloc(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
+ u64 bucket_journal_seq;
struct bch_fs *c = trans->c;
- struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
- struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
+ struct bch_alloc_v4 old_a_convert, new_a_convert;
+ const struct bch_alloc_v4 *old_a, *new_a;
struct bch_dev *ca;
- struct bucket *g;
- struct bucket_mark old_m, m;
int ret = 0;
/*
@@ -516,59 +506,72 @@ static int bch2_mark_alloc(struct btree_trans *trans,
!(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+ "alloc key for invalid device or bucket"))
+ return -EIO;
+
+ ca = bch_dev_bkey_exists(c, new.k->p.inode);
+
+ old_a = bch2_alloc_to_v4(old, &old_a_convert);
+ new_a = bch2_alloc_to_v4(new, &new_a_convert);
+
+ bucket_journal_seq = new_a->journal_seq;
+
if ((flags & BTREE_TRIGGER_INSERT) &&
- !old_u.data_type != !new_u.data_type &&
- new.k->type == KEY_TYPE_alloc_v3) {
- struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+ data_type_is_empty(old_a->data_type) !=
+ data_type_is_empty(new_a->data_type) &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
- BUG_ON(!journal_seq);
+ EBUG_ON(!journal_seq);
/*
* If the btree updates referring to a bucket weren't flushed
* before the bucket became empty again, then we don't have
* to wait on a journal flush before we can reuse the bucket:
*/
- new_u.journal_seq = !new_u.data_type &&
- bch2_journal_noflush_seq(&c->journal, journal_seq)
+ v->journal_seq = bucket_journal_seq =
+ data_type_is_empty(new_a->data_type) &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
? 0 : journal_seq;
- v->journal_seq = cpu_to_le64(new_u.journal_seq);
}
- if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
- ret = bch2_set_bucket_needs_journal_commit(c,
- new_u.dev, new_u.bucket,
- new_u.journal_seq);
- if (ret)
+ if (!data_type_is_empty(old_a->data_type) &&
+ data_type_is_empty(new_a->data_type) &&
+ bucket_journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ bucket_journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
return ret;
+ }
}
- ca = bch_dev_bkey_exists(c, new_u.dev);
+ percpu_down_read(&c->mark_lock);
+ if (!gc && new_a->gen != old_a->gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a->gen;
- if (new_u.bucket >= ca->mi.nbuckets)
- return 0;
+ bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
- percpu_down_read(&c->mark_lock);
- if (!gc && new_u.gen != old_u.gen)
- *bucket_gen(ca, new_u.bucket) = new_u.gen;
-
- g = __bucket(ca, new_u.bucket, gc);
-
- old_m = bucket_cmpxchg(g, m, ({
- m.gen = new_u.gen;
- m.data_type = new_u.data_type;
- m.dirty_sectors = new_u.dirty_sectors;
- m.cached_sectors = new_u.cached_sectors;
- m.stripe = new_u.stripe != 0;
- }));
-
- bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
-
- g->io_time[READ] = new_u.read_time;
- g->io_time[WRITE] = new_u.write_time;
- g->oldest_gen = new_u.oldest_gen;
- g->gen_valid = 1;
- g->stripe = new_u.stripe;
- g->stripe_redundancy = new_u.stripe_redundancy;
+ if (gc) {
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+ bucket_lock(g);
+
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ g->data_type = new_a->data_type;
+ g->stripe = new_a->stripe;
+ g->stripe_redundancy = new_a->stripe_redundancy;
+ g->dirty_sectors = new_a->dirty_sectors;
+ g->cached_sectors = new_a->cached_sectors;
+
+ bucket_unlock(g);
+ }
percpu_up_read(&c->mark_lock);
/*
@@ -577,40 +580,43 @@ static int bch2_mark_alloc(struct btree_trans *trans,
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old_m.cached_sectors) {
+ old_a->cached_sectors) {
ret = update_cached_sectors(c, new, ca->dev_idx,
- -old_m.cached_sectors,
+ -((s64) old_a->cached_sectors),
journal_seq, gc);
if (ret) {
- bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
return ret;
}
-
- trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
- old_m.cached_sectors);
}
+ if (new_a->data_type == BCH_DATA_free &&
+ (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
+ if (new_a->data_type == BCH_DATA_need_discard &&
+ (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
+ bch2_do_discards(c);
+
+ if (old_a->data_type != BCH_DATA_cached &&
+ new_a->data_type == BCH_DATA_cached &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_do_invalidates(c);
+
+ if (new_a->data_type == BCH_DATA_need_gc_gens)
+ bch2_do_gc_gens(c);
+
return 0;
}
-#define checked_add(a, b) \
-({ \
- unsigned _res = (unsigned) (a) + (b); \
- bool overflow = _res > U16_MAX; \
- if (overflow) \
- _res = U16_MAX; \
- (a) = _res; \
- overflow; \
-})
-
-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type data_type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
+int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type data_type,
+ unsigned sectors, struct gc_pos pos,
+ unsigned flags)
{
- struct bucket *g;
- struct bucket_mark old, new;
- bool overflow;
+ struct bucket old, new, *g;
+ int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
BUG_ON(data_type != BCH_DATA_sb &&
@@ -620,115 +626,144 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
* Backup superblock might be past the end of our normal usable space:
*/
if (b >= ca->mi.nbuckets)
- return;
+ return 0;
percpu_down_read(&c->mark_lock);
g = gc_bucket(ca, b);
- old = bucket_cmpxchg(g, new, ({
- new.data_type = data_type;
- overflow = checked_add(new.dirty_sectors, sectors);
- }));
-
- bch2_fs_inconsistent_on(old.data_type &&
- old.data_type != data_type, c,
- "different types of data in same bucket: %s, %s",
- bch2_data_types[old.data_type],
- bch2_data_types[data_type]);
-
- bch2_fs_inconsistent_on(overflow, c,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX",
- ca->dev_idx, b, new.gen,
- bch2_data_types[old.data_type ?: data_type],
- old.dirty_sectors, sectors);
-
- bch2_dev_usage_update(c, ca, old, new, 0, true);
- percpu_up_read(&c->mark_lock);
-}
-static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
- EBUG_ON(sectors < 0);
+ bucket_lock(g);
+ old = *g;
- return p.crc.compression_type &&
- p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
- ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
- p.crc.uncompressed_size)
- : sectors;
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_types[g->data_type],
+ bch2_data_types[data_type])) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_types[g->data_type ?: data_type],
+ g->dirty_sectors, sectors)) {
+ ret = -EIO;
+ goto err;
+ }
+
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return ret;
}
-static int check_bucket_ref(struct bch_fs *c,
+static int check_bucket_ref(struct btree_trans *trans,
struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 bucket_data_type,
- u16 dirty_sectors, u16 cached_sectors)
+ u8 b_gen, u8 bucket_data_type,
+ u32 dirty_sectors, u32 cached_sectors)
{
- size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
u16 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (bucket_data_type == BCH_DATA_cached)
+ bucket_data_type = BCH_DATA_user;
+
+ if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
+ (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
+ bucket_data_type = ptr_data_type = BCH_DATA_stripe;
- if (gen_after(ptr->gen, bucket_gen)) {
+ if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if (bucket_gen != ptr->gen && !ptr->cached) {
+ if (b_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
+ *bucket_gen(ca, bucket_nr),
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if (bucket_gen != ptr->gen)
- return 1;
+ if (b_gen != ptr->gen) {
+ ret = 1;
+ goto out;
+ }
- if (bucket_data_type && ptr_data_type &&
+ if (!data_type_is_empty(bucket_data_type) &&
+ ptr_data_type &&
bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type],
bch2_data_types[ptr_data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
- if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
+ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bucket_sectors, sectors,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EIO;
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
}
-
- return 0;
+out:
+ printbuf_exit(&buf);
+ return ret;
+err:
+ bch2_dump_trans_updates(trans);
+ goto out;
}
static int mark_stripe_bucket(struct btree_trans *trans,
@@ -741,13 +776,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : 0;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g;
- struct bucket_mark new, old;
- char buf[200];
+ struct bucket old, new, *g;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
@@ -755,40 +789,41 @@ static int mark_stripe_bucket(struct btree_trans *trans,
/* XXX doesn't handle deletion */
percpu_down_read(&c->mark_lock);
+ buf.atomic++;
g = PTR_GC_BUCKET(ca, ptr);
- if (g->mark.dirty_sectors ||
+ if (g->dirty_sectors ||
(g->stripe && g->stripe != k.k->p.offset)) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EINVAL;
goto err;
}
- old = bucket_cmpxchg(g, new, ({
- ret = check_bucket_ref(c, k, ptr, sectors, data_type,
- new.gen, new.data_type,
- new.dirty_sectors, new.cached_sectors);
- if (ret)
- goto err;
+ bucket_lock(g);
+ old = *g;
- new.dirty_sectors += sectors;
- if (data_type)
- new.data_type = data_type;
+ ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
+ g->gen, g->data_type,
+ g->dirty_sectors, g->cached_sectors);
+ if (ret)
+ goto err;
- new.stripe = true;
- }));
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
-
- bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+ new = *g;
err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
percpu_up_read(&c->mark_lock);
-
- return 0;
+ printbuf_exit(&buf);
+ return ret;
}
static int __mark_pointer(struct btree_trans *trans,
@@ -796,12 +831,12 @@ static int __mark_pointer(struct btree_trans *trans,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 *bucket_data_type,
- u16 *dirty_sectors, u16 *cached_sectors)
+ u32 *dirty_sectors, u32 *cached_sectors)
{
- u16 *dst_sectors = !ptr->cached
+ u32 *dst_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
- int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
+ int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
bucket_gen, *bucket_data_type,
*dirty_sectors, *cached_sectors);
@@ -815,50 +850,40 @@ static int __mark_pointer(struct btree_trans *trans,
}
static int bch2_mark_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type,
+ s64 sectors,
unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g;
+ struct bucket old, new, *g;
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
u8 bucket_data_type;
- u64 v;
int ret = 0;
BUG_ON(!(flags & BTREE_TRIGGER_GC));
percpu_down_read(&c->mark_lock);
g = PTR_GC_BUCKET(ca, &p.ptr);
-
- v = atomic64_read(&g->_mark.v);
- do {
- new.v.counter = old.v.counter = v;
- bucket_data_type = new.data_type;
-
- ret = __mark_pointer(trans, k, &p.ptr, sectors,
- data_type, new.gen,
- &bucket_data_type,
- &new.dirty_sectors,
- &new.cached_sectors);
- if (ret)
- goto err;
-
- new.data_type = bucket_data_type;
-
- if (flags & BTREE_TRIGGER_NOATOMIC) {
- g->_mark = new;
- break;
- }
- } while ((v = atomic64_cmpxchg(&g->_mark.v,
- old.v.counter,
- new.v.counter)) != old.v.counter);
-
- bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
-err:
+ bucket_lock(g);
+ old = *g;
+
+ bucket_data_type = g->data_type;
+ ret = __mark_pointer(trans, k, &p.ptr, sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (!ret)
+ g->data_type = bucket_data_type;
+
+ new = *g;
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
percpu_up_read(&c->mark_lock);
return ret;
@@ -881,13 +906,13 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
if (!m) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu",
(u64) p.idx);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_mark_stripe_ptr;
}
- spin_lock(&c->ec_stripes_heap_lock);
+ mutex_lock(&c->ec_stripes_heap_lock);
if (!m || !m->alive) {
- spin_unlock(&c->ec_stripes_heap_lock);
+ mutex_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
(u64) p.idx);
bch2_inconsistent_error(c);
@@ -897,7 +922,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
m->block_sectors[p.block] += sectors;
r = m->r;
- spin_unlock(&c->ec_stripes_heap_lock);
+ mutex_unlock(&c->ec_stripes_heap_lock);
r.e.data_type = data_type;
update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
@@ -905,13 +930,14 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_extent(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -938,8 +964,7 @@ static int bch2_mark_extent(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_OVERWRITE)
disk_sectors = -disk_sectors;
- ret = bch2_mark_pointer(trans, k, p, disk_sectors,
- data_type, flags);
+ ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
if (ret < 0)
return ret;
@@ -950,7 +975,8 @@ static int bch2_mark_extent(struct btree_trans *trans,
ret = update_cached_sectors(c, k, p.ptr.dev,
disk_sectors, journal_seq, true);
if (ret) {
- bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
return ret;
}
}
@@ -975,10 +1001,11 @@ static int bch2_mark_extent(struct btree_trans *trans,
if (r.e.nr_devs) {
ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
if (ret) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ printbuf_exit(&buf);
return ret;
}
}
@@ -986,9 +1013,10 @@ static int bch2_mark_extent(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
u64 journal_seq = trans->journal_res.seq;
@@ -1006,26 +1034,26 @@ static int bch2_mark_stripe(struct btree_trans *trans,
if (!gc) {
struct stripe *m = genradix_ptr(&c->stripes, idx);
- if (!m || (old_s && !m->alive)) {
- char buf1[200], buf2[200];
+ if (!m) {
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf1), c, old);
- bch2_bkey_val_to_text(&PBUF(buf2), c, new);
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
"old %s\n"
- "new %s", idx, buf1, buf2);
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
bch2_inconsistent_error(c);
return -1;
}
if (!new_s) {
- spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_del(c, m, idx);
- spin_unlock(&c->ec_stripes_heap_lock);
memset(m, 0, sizeof(*m));
} else {
- m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
m->nr_blocks = new_s->nr_blocks;
@@ -1035,9 +1063,10 @@ static int bch2_mark_stripe(struct btree_trans *trans,
for (i = 0; i < new_s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_update(c, m, idx);
- spin_unlock(&c->ec_stripes_heap_lock);
+ if (!old_s)
+ bch2_stripes_heap_insert(c, m, idx);
+ else
+ bch2_stripes_heap_update(c, m, idx);
}
} else {
struct gc_stripe *m =
@@ -1046,7 +1075,7 @@ static int bch2_mark_stripe(struct btree_trans *trans,
if (!m) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu",
idx);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_mark_stripe;
}
/*
* This will be wrong when we bring back runtime gc: we should
@@ -1078,10 +1107,11 @@ static int bch2_mark_stripe(struct btree_trans *trans,
((s64) m->sectors * m->nr_redundant),
journal_seq, gc);
if (ret) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
return ret;
}
}
@@ -1089,19 +1119,20 @@ static int bch2_mark_stripe(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_inode(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bch_fs_usage __percpu *fs_usage;
u64 journal_seq = trans->journal_res.seq;
if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v;
+ struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
BUG_ON(!journal_seq);
- BUG_ON(new.k->type != KEY_TYPE_inode_v2);
+ BUG_ON(new.k->type != KEY_TYPE_inode_v3);
v->bi_journal_seq = cpu_to_le64(journal_seq);
}
@@ -1120,12 +1151,13 @@ static int bch2_mark_inode(struct btree_trans *trans,
return 0;
}
-static int bch2_mark_reservation(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage __percpu *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
@@ -1152,18 +1184,24 @@ static int bch2_mark_reservation(struct btree_trans *trans,
return 0;
}
-static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 start, u64 end,
u64 *idx, unsigned flags, size_t r_idx)
{
+ struct bch_fs *c = trans->c;
struct reflink_gc *r;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 next_idx = end;
s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
if (r_idx >= c->reflink_gc_nr)
goto not_found;
r = genradix_ptr(&c->reflink_gc_table, r_idx);
- if (*idx < r->offset - r->size)
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
goto not_found;
BUG_ON((s64) r->refcount + add < 0);
@@ -1172,37 +1210,45 @@ static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
*idx = r->offset;
return 0;
not_found:
- *idx = U64_MAX;
- ret = -EIO;
-
- /*
- * XXX: we're replacing the entire reflink pointer with an error
- * key, we should just be replacing the part that was missing:
- */
- if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
- struct bkey_i_error new;
+ if (fsck_err(c, "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
+ struct bkey_i_error *new;
+
+ new = bch2_trans_kmalloc(trans, sizeof(*new));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
- bkey_init(&new.k);
- new.k.type = KEY_TYPE_error;
- new.k.p = p.k->p;
- new.k.size = p.k->size;
- ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i);
+ bkey_init(&new->k);
+ new->k.type = KEY_TYPE_error;
+ new->k.p = bkey_start_pos(p.k);
+ new->k.p.offset += *idx - start;
+ bch2_key_resize(&new->k, next_idx - *idx);
+ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i,
+ BTREE_TRIGGER_NORUN);
}
+
+ *idx = next_idx;
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
- u64 idx = le64_to_cpu(p.v->idx);
+ u64 idx = le64_to_cpu(p.v->idx), start = idx;
u64 end = le64_to_cpu(p.v->idx) + p.k->size;
int ret = 0;
@@ -1226,111 +1272,51 @@ static int bch2_mark_reflink_p(struct btree_trans *trans,
}
while (idx < end && !ret)
- ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
-
- return ret;
-}
-
-int bch2_mark_key(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned flags)
-{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
- switch (k.k->type) {
- case KEY_TYPE_alloc:
- case KEY_TYPE_alloc_v2:
- case KEY_TYPE_alloc_v3:
- return bch2_mark_alloc(trans, old, new, flags);
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return bch2_mark_extent(trans, old, new, flags);
- case KEY_TYPE_stripe:
- return bch2_mark_stripe(trans, old, new, flags);
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- return bch2_mark_inode(trans, old, new, flags);
- case KEY_TYPE_reservation:
- return bch2_mark_reservation(trans, old, new, flags);
- case KEY_TYPE_reflink_p:
- return bch2_mark_reflink_p(trans, old, new, flags);
- case KEY_TYPE_snapshot:
- return bch2_mark_snapshot(trans, old, new, flags);
- default:
- return 0;
- }
-}
-
-int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *new, unsigned flags)
-{
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- struct bkey_s_c old;
- struct bkey unpacked;
- int ret;
-
- _deleted.p = path->pos;
-
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
- return 0;
-
- if (!btree_node_type_needs_gc(path->btree_id))
- return 0;
-
- old = bch2_btree_path_peek_slot(path, &unpacked);
-
- if (old.k->type == new->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- } else {
- ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
- }
+ ret = __bch2_mark_reflink_p(trans, p, start, end,
+ &idx, flags, l++);
return ret;
}
-static noinline __cold
-void fs_usage_apply_warn(struct btree_trans *trans,
- unsigned disk_res_sectors,
- s64 should_not_have_added)
+void bch2_trans_fs_usage_revert(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- char buf[200];
+ struct bch_fs_usage *dst;
+ struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
+ s64 added = 0;
+ unsigned i;
- bch_err(c, "disk usage increased %lli more than %u sectors reserved",
- should_not_have_added, disk_res_sectors);
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
- trans_for_each_update(trans, i) {
- pr_err("while inserting");
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- pr_err("%s", buf);
- pr_err("overlapping with");
+ /* revert changes: */
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+ BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
+ }
- if (!i->cached) {
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
+ dst->nr_inodes -= deltas->nr_inodes;
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- pr_err("%s", buf);
- } else {
- struct bkey_cached *ck = (void *) i->path->l[0].b;
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added -= deltas->persistent_reserved[i];
+ dst->reserved -= deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
+ }
- if (ck->valid) {
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
- pr_err("%s", buf);
- }
- }
+ if (added > 0) {
+ trans->disk_res->sectors += added;
+ this_cpu_add(*c->online_reserved, added);
}
- __WARN();
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
}
int bch2_trans_fs_usage_apply(struct btree_trans *trans,
@@ -1397,7 +1383,9 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
- fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added);
+ bch2_trans_inconsistent(trans,
+ "disk usage increased %lli more than %u sectors reserved)",
+ should_not_have_added, disk_res_sectors);
return 0;
need_mark:
/* revert changes: */
@@ -1411,53 +1399,42 @@ need_mark:
/* trans_mark: */
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
- const struct bch_extent_ptr *ptr,
- struct bkey_alloc_unpacked *u)
+static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ unsigned flags)
{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bkey_s_c k;
+ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ struct bpos bucket;
+ struct bch_backpointer bp;
+ s64 sectors;
int ret;
- bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
- POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret) {
- bch2_trans_iter_exit(trans, iter);
- return ret;
- }
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+ sectors = bp.bucket_len;
+ if (!insert)
+ sectors = -sectors;
- *u = bch2_alloc_unpack(k);
- return 0;
-}
+ a = bch2_trans_start_alloc_update(trans, &iter, bucket);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
-static int bch2_trans_mark_pointer(struct btree_trans *trans,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type)
-{
- struct btree_iter iter;
- struct bkey_alloc_unpacked u;
- int ret;
-
- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors);
if (ret)
- return ret;
+ goto err;
- ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
- u.gen, &u.data_type,
- &u.dirty_sectors, &u.cached_sectors);
- if (ret)
- goto out;
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ if (ret)
+ goto err;
+ }
- ret = bch2_alloc_write(trans, &iter, &u, 0);
- if (ret)
- goto out;
-out:
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
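
The reworked bch2_trans_mark_pointer() above derives the bucket to update straight from the extent pointer, negates the sector count when the trigger runs for an overwrite, and only maintains a backpointer for non-cached pointers. A minimal standalone sketch of the bucket/sector arithmetic involved; all names below are hypothetical stand-ins, not the bcachefs helpers:

#include <stdbool.h>
#include <stdint.h>

struct bucket_pos { unsigned dev; uint64_t bucket; };

/* A bucket is a fixed-size run of sectors, so the bucket index is a division: */
static uint64_t sector_to_bucket(uint64_t sector, uint64_t bucket_size)
{
	return sector / bucket_size;
}

static struct bucket_pos ptr_bucket_pos(unsigned dev, uint64_t offset,
					uint64_t bucket_size)
{
	return (struct bucket_pos) {
		.dev	= dev,
		.bucket	= sector_to_bucket(offset, bucket_size),
	};
}

/* Overwrite triggers subtract the sectors the old key had accounted for: */
static int64_t signed_sectors(uint64_t sectors, bool overwrite)
{
	return overwrite ? -(int64_t) sectors : (int64_t) sectors;
}
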
@@ -1466,9 +1443,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_s_c k;
struct bkey_i_stripe *s;
struct bch_replicas_padded r;
int ret = 0;
@@ -1476,34 +1451,23 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx),
BTREE_ITER_INTENT|
BTREE_ITER_WITH_UPDATES);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_stripe) {
- bch2_fs_inconsistent(c,
+ s = bch2_bkey_get_mut_typed(trans, &iter, stripe);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent_on(ret == -ENOENT, trans,
"pointer to nonexistent stripe %llu",
(u64) p.ec.idx);
- bch2_inconsistent_error(c);
- ret = -EIO;
goto err;
}
- if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
- bch2_fs_inconsistent(c,
+ if (!bch2_ptr_matches_stripe(&s->v, p)) {
+ bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
ret = -EIO;
goto err;
}
- s = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- goto err;
-
- bkey_reassemble(&s->k_i, k);
stripe_blockcount_set(&s->v, p.ec.block,
stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
@@ -1520,10 +1484,15 @@ err:
return ret;
}
-static int bch2_trans_mark_extent(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -1548,8 +1517,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_OVERWRITE)
disk_sectors = -disk_sectors;
- ret = bch2_trans_mark_pointer(trans, k, p,
- disk_sectors, data_type);
+ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
if (ret < 0)
return ret;
@@ -1585,7 +1553,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
+ struct bkey_i_alloc_v4 *a;
enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
? BCH_DATA_parity : 0;
s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
@@ -1594,59 +1562,61 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
if (deleting)
sectors = -sectors;
- ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
- ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type,
- u.gen, u.data_type,
- u.dirty_sectors, u.cached_sectors);
+ ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors, a->v.cached_sectors);
if (ret)
goto err;
if (!deleting) {
- if (bch2_fs_inconsistent_on(u.stripe ||
- u.stripe_redundancy, c,
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
- u.dirty_sectors,
- u.stripe, s.k->p.offset)) {
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
- if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
- u.dirty_sectors,
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
s.k->p.offset)) {
ret = -EIO;
goto err;
}
- u.stripe = s.k->p.offset;
- u.stripe_redundancy = s.v->nr_redundant;
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
+ a->v.data_type = BCH_DATA_stripe;
} else {
- if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
- u.stripe_redundancy != s.v->nr_redundant, c,
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, u.gen,
- s.k->p.offset, u.stripe)) {
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
ret = -EIO;
goto err;
}
- u.stripe = 0;
- u.stripe_redundancy = 0;
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
+ a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
}
- u.dirty_sectors += sectors;
+ a->v.dirty_sectors += sectors;
if (data_type)
- u.data_type = !deleting ? data_type : 0;
+ a->v.data_type = !deleting ? data_type : 0;
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto err;
err:
@@ -1654,66 +1624,69 @@ err:
return ret;
}
-static int bch2_trans_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+int bch2_trans_mark_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
{
- struct bkey_s_c_stripe old_s = { .k = NULL };
- struct bkey_s_c_stripe new_s = { .k = NULL };
+ const struct bch_stripe *old_s = NULL;
+ struct bch_stripe *new_s = NULL;
struct bch_replicas_padded r;
unsigned i, nr_blocks;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
- old_s = bkey_s_c_to_stripe(old);
- if (new.k->type == KEY_TYPE_stripe)
- new_s = bkey_s_c_to_stripe(new);
+ old_s = bkey_s_c_to_stripe(old).v;
+ if (new->k.type == KEY_TYPE_stripe)
+ new_s = &bkey_i_to_stripe(new)->v;
/*
* If the pointers aren't changing, we don't need to do anything:
*/
- if (new_s.k && old_s.k &&
- new_s.v->nr_blocks == old_s.v->nr_blocks &&
- new_s.v->nr_redundant == old_s.v->nr_redundant &&
- !memcmp(old_s.v->ptrs, new_s.v->ptrs,
- new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
- BUG_ON(new_s.k && old_s.k &&
- (new_s.v->nr_blocks != old_s.v->nr_blocks ||
- new_s.v->nr_redundant != old_s.v->nr_redundant));
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
- nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks;
+ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
- if (new_s.k) {
- s64 sectors = le16_to_cpu(new_s.v->sectors);
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
- bch2_bkey_to_replicas(&r.e, new);
- update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
+ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
}
- if (old_s.k) {
- s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
bch2_bkey_to_replicas(&r.e, old);
- update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
}
for (i = 0; i < nr_blocks; i++) {
- if (new_s.k && old_s.k &&
- !memcmp(&new_s.v->ptrs[i],
- &old_s.v->ptrs[i],
- sizeof(new_s.v->ptrs[i])))
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
continue;
- if (new_s.k) {
- ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false);
+ if (new_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_i_to_s_c_stripe(new), i, false);
if (ret)
break;
}
- if (old_s.k) {
- ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true);
+ if (old_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
if (ret)
break;
}
@@ -1722,12 +1695,13 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
-static int bch2_trans_mark_inode(struct btree_trans *trans,
- struct bkey_s_c old,
- struct bkey_s_c new,
- unsigned flags)
+int bch2_trans_mark_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
- int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
if (nr) {
struct replicas_delta_list *d =
@@ -1738,9 +1712,15 @@ static int bch2_trans_mark_inode(struct btree_trans *trans,
return 0;
}
-static int bch2_trans_mark_reservation(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
@@ -1764,43 +1744,35 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_i *n;
+ struct bkey_i *k;
__le64 *refcount;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
BTREE_ITER_INTENT|
BTREE_ITER_WITH_UPDATES);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- ret = PTR_ERR_OR_ZERO(n);
+ k = bch2_bkey_get_mut(trans, &iter);
+ ret = PTR_ERR_OR_ZERO(k);
if (ret)
goto err;
- bkey_reassemble(n, k);
-
- refcount = bkey_refcount(n);
+ refcount = bkey_refcount(k);
if (!refcount) {
- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
- bch2_fs_inconsistent(c,
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
"nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf);
+ *idx, buf.buf);
ret = -EIO;
goto err;
}
if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
- bch2_fs_inconsistent(c,
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
"indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf);
+ *idx, buf.buf);
ret = -EIO;
goto err;
}
@@ -1810,37 +1782,39 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
u64 pad;
pad = max_t(s64, le32_to_cpu(v->front_pad),
- le64_to_cpu(v->idx) - bkey_start_offset(k.k));
+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
BUG_ON(pad > U32_MAX);
v->front_pad = cpu_to_le32(pad);
pad = max_t(s64, le32_to_cpu(v->back_pad),
- k.k->p.offset - p.k->size - le64_to_cpu(v->idx));
+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
BUG_ON(pad > U32_MAX);
v->back_pad = cpu_to_le32(pad);
}
le64_add_cpu(refcount, add);
- if (!*refcount) {
- n->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&n->k, 0);
- }
-
bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, n, 0);
+ ret = bch2_trans_update(trans, &iter, k, 0);
if (ret)
goto err;
- *idx = k.k->p.offset;
+ *idx = k->k.p.offset;
err:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
}
-static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c k, unsigned flags)
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
{
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
+ ? old
+ : bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
@@ -1861,31 +1835,6 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
- struct bkey_s_c new, unsigned flags)
-{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- case KEY_TYPE_reflink_v:
- return bch2_trans_mark_extent(trans, k, flags);
- case KEY_TYPE_stripe:
- return bch2_trans_mark_stripe(trans, old, new, flags);
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- return bch2_trans_mark_inode(trans, old, new, flags);
- case KEY_TYPE_reservation:
- return bch2_trans_mark_reservation(trans, k, flags);
- case KEY_TYPE_reflink_p:
- return bch2_trans_mark_reflink_p(trans, k, flags);
- default:
- return 0;
- }
-}
-
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
@@ -1893,11 +1842,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_alloc_unpacked u;
- struct bch_extent_ptr ptr = {
- .dev = ca->dev_idx,
- .offset = bucket_to_sector(ca, b),
- };
+ struct bkey_i_alloc_v4 *a;
int ret = 0;
/*
@@ -1906,26 +1851,26 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
if (b >= ca->mi.nbuckets)
return 0;
- ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
- if (u.data_type && u.data_type != type) {
+ if (a->v.data_type && type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- iter.pos.inode, iter.pos.offset, u.gen,
- bch2_data_types[u.data_type],
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
goto out;
}
- u.data_type = type;
- u.dirty_sectors = sectors;
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
- ret = bch2_alloc_write(trans, &iter, &u, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
@@ -1938,7 +1883,7 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
enum bch_data_type type,
unsigned sectors)
{
- return __bch2_trans_do(trans, NULL, NULL, 0,
+ return commit_do(trans, NULL, NULL, 0,
__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
}
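
bch2_trans_mark_metadata_bucket() now wraps its body in commit_do(), which runs the transaction body plus a commit and reruns both if the transaction was restarted. A rough sketch of that retry shape, assuming a hypothetical restart error code (the real one is a private bcachefs errcode):

/* Hypothetical restart code standing in for the real transaction-restart errcode. */
#define RESTART_ERR	(-512)

/*
 * Shape of a commit_do()-style helper: run the transaction body, then the
 * commit, and rerun both whenever the transaction was restarted.
 */
static int commit_do_sketch(int (*body)(void *), int (*commit)(void *), void *arg)
{
	int ret;

	do {
		ret = body(arg);
		if (!ret)
			ret = commit(arg);
	} while (ret == RESTART_ERR);

	return ret;
}
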
@@ -2016,15 +1961,14 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
- return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
- __bch2_trans_mark_dev_sb(&trans, ca));
+ return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca));
}
/* Disk reservations: */
#define SECTORS_CACHE 1024
-int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;
@@ -2077,7 +2021,7 @@ recalculate:
ret = 0;
} else {
atomic64_set(&c->sectors_available, sectors_available);
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_disk_reservation;
}
mutex_unlock(&c->sectors_available_lock);
@@ -2088,65 +2032,35 @@ recalculate:
/* Startup/shutdown: */
-static void buckets_free_rcu(struct rcu_head *rcu)
-{
- struct bucket_array *buckets =
- container_of(rcu, struct bucket_array, rcu);
-
- kvpfree(buckets,
- sizeof(struct bucket_array) +
- buckets->nbuckets * sizeof(struct bucket));
-}
-
static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
- kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets);
+ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
- struct bucket_array *buckets = NULL, *old_buckets = NULL;
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
- alloc_fifo free[RESERVE_NR];
- alloc_fifo free_inc;
- alloc_heap alloc_heap;
-
- size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / btree_sectors(c));
- /* XXX: these should be tunable */
- size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
- size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
- btree_reserve * 2);
- bool resize = ca->buckets[0] != NULL;
- int ret = -ENOMEM;
- unsigned i;
+ bool resize = ca->bucket_gens != NULL;
+ int ret;
- memset(&free, 0, sizeof(free));
- memset(&free_inc, 0, sizeof(free_inc));
- memset(&alloc_heap, 0, sizeof(alloc_heap));
+ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+ GFP_KERNEL|__GFP_ZERO))) {
+ ret = -BCH_ERR_ENOMEM_bucket_gens;
+ goto err;
+ }
- if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
- nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO)) ||
- !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
- GFP_KERNEL|__GFP_ZERO)) ||
- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+ if ((c->opts.buckets_nouse &&
+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
- GFP_KERNEL|__GFP_ZERO)) ||
- !init_fifo(&free[RESERVE_MOVINGGC],
- copygc_reserve, GFP_KERNEL) ||
- !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
- !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
- !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+ GFP_KERNEL|__GFP_ZERO)))) {
+ ret = -BCH_ERR_ENOMEM_buckets_nouse;
goto err;
+ }
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = nbuckets;
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
@@ -2158,64 +2072,39 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
percpu_down_write(&c->mark_lock);
}
- old_buckets = bucket_array(ca);
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
- size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
- memcpy(buckets->b,
- old_buckets->b,
- n * sizeof(struct bucket));
memcpy(bucket_gens->b,
old_bucket_gens->b,
n);
- memcpy(buckets_nouse,
- ca->buckets_nouse,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
+ if (buckets_nouse)
+ memcpy(buckets_nouse,
+ ca->buckets_nouse,
+ BITS_TO_LONGS(n) * sizeof(unsigned long));
}
- rcu_assign_pointer(ca->buckets[0], buckets);
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
- buckets = old_buckets;
bucket_gens = old_bucket_gens;
swap(ca->buckets_nouse, buckets_nouse);
+ nbuckets = ca->mi.nbuckets;
+
if (resize) {
percpu_up_write(&c->mark_lock);
+ up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
}
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- fifo_move(&free[i], &ca->free[i]);
- swap(ca->free[i], free[i]);
- }
- fifo_move(&free_inc, &ca->free_inc);
- swap(ca->free_inc, free_inc);
- spin_unlock(&c->freelist_lock);
-
- /* with gc lock held, alloc_heap can't be in use: */
- swap(ca->alloc_heap, alloc_heap);
-
- nbuckets = ca->mi.nbuckets;
-
- if (resize)
- up_write(&ca->bucket_lock);
-
ret = 0;
err:
- free_heap(&alloc_heap);
- free_fifo(&free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens)
- call_rcu(&old_buckets->rcu, bucket_gens_free_rcu);
- if (buckets)
- call_rcu(&old_buckets->rcu, buckets_free_rcu);
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
return ret;
}
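
With the in-memory bucket array and the allocator FIFOs gone, bch2_dev_buckets_resize() above only carries the per-bucket generation bytes across a resize: allocate the new table, copy the overlapping range under the mark lock, publish the new pointer with rcu_assign_pointer(), and free the old table from an RCU callback. A userspace-flavoured sketch of the same copy-and-swap pattern, using plain malloc in place of kvpmalloc/call_rcu (names hypothetical):

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct gens_table {
	size_t	nbuckets;
	uint8_t	b[];		/* one generation byte per bucket */
};

/* Resize by copying the overlapping range; the caller swaps the pointer in. */
static struct gens_table *gens_table_resize(const struct gens_table *old, size_t nbuckets)
{
	struct gens_table *n = calloc(1, sizeof(*n) + nbuckets);

	if (!n)
		return NULL;

	n->nbuckets = nbuckets;
	if (old) {
		size_t copy = old->nbuckets < nbuckets ? old->nbuckets : nbuckets;

		memcpy(n->b, old->b, copy);
	}
	return n;
}
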
@@ -2224,15 +2113,10 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
- free_heap(&ca->alloc_heap);
- free_fifo(&ca->free_inc);
- for (i = 0; i < RESERVE_NR; i++)
- free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
- sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket));
+ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+ sizeof(struct bucket_gens) + ca->mi.nbuckets);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
@@ -2245,13 +2129,13 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_usage_init;
for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_usage_init;
}
- return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
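
With bch2_mark_key() and bch2_trans_mark_key() deleted above, the big per-key-type switch leaves buckets.c; the now-exported bch2_mark_*()/bch2_trans_mark_*() functions share one signature so they can be dispatched per key type (in bcachefs they end up referenced from the per-key-type method tables). A minimal sketch of that dispatch shape, with invented types standing in for the real btree_trans/bkey structures:

/* Hypothetical stand-ins for the real btree_trans/bkey types. */
struct trans;
struct key;

typedef int (*mark_fn)(struct trans *, unsigned btree_id, unsigned level,
		       const struct key *old, const struct key *new, unsigned flags);

enum key_type { KEY_EXTENT, KEY_STRIPE, KEY_INODE, KEY_NR };

static int mark_extent(struct trans *t, unsigned btree_id, unsigned level,
		       const struct key *old, const struct key *new, unsigned flags)
{
	return 0;	/* body omitted in this sketch */
}

static int mark_stripe(struct trans *t, unsigned btree_id, unsigned level,
		       const struct key *old, const struct key *new, unsigned flags)
{
	return 0;	/* body omitted in this sketch */
}

/* One table lookup replaces the old switch statement: */
static const mark_fn mark_fns[KEY_NR] = {
	[KEY_EXTENT]	= mark_extent,
	[KEY_STRIPE]	= mark_stripe,
};

static int mark_key(struct trans *t, enum key_type type, unsigned btree_id, unsigned level,
		    const struct key *old, const struct key *new, unsigned flags)
{
	return mark_fns[type]
		? mark_fns[type](t, btree_id, level, old, new, flags)
		: 0;	/* key types without a trigger are a no-op */
}
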
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 27f7659ca754..d677b0225c52 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -9,58 +9,39 @@
#define _BUCKETS_H
#include "buckets_types.h"
+#include "extents.h"
#include "super.h"
#define for_each_bucket(_b, _buckets) \
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-#define bucket_cmpxchg(g, new, expr) \
-({ \
- struct bucket *_g = g; \
- u64 _v = atomic64_read(&(g)->_mark.v); \
- struct bucket_mark _old; \
- \
- do { \
- (new).v.counter = _old.v.counter = _v; \
- expr; \
- } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \
- _old.v.counter, \
- (new).v.counter)) != _old.v.counter);\
- _old; \
-})
-
-static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
- bool gc)
+static inline void bucket_unlock(struct bucket *b)
{
- return rcu_dereference_check(ca->buckets[gc],
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
- lockdep_is_held(&ca->bucket_lock));
+ smp_store_release(&b->lock, 0);
}
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline void bucket_lock(struct bucket *b)
{
- return __bucket_array(ca, false);
+ while (xchg(&b->lock, 1))
+ cpu_relax();
}
-static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
{
- struct bucket_array *buckets = __bucket_array(ca, gc);
-
- BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
- return buckets->b + b;
+ return rcu_dereference_check(ca->buckets_gc,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- return __bucket(ca, b, true);
-}
+ struct bucket_array *buckets = gc_bucket_array(ca);
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
- return __bucket(ca, b, false);
+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+ return buckets->b + b;
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
@@ -70,7 +51,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
-
}
static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
@@ -81,26 +61,27 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
return gens->b + b;
}
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
{
- return g->mark.gen - g->oldest_gen;
+ return sector_to_bucket(ca, ptr->offset);
}
-static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
- return sector_to_bucket(ca, ptr->offset);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
}
-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr,
+ u32 *bucket_offset)
{
- return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
}
static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
@@ -112,13 +93,22 @@ static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
static inline enum bch_data_type ptr_data_type(const struct bkey *k,
const struct bch_extent_ptr *ptr)
{
- if (k->type == KEY_TYPE_btree_ptr ||
- k->type == KEY_TYPE_btree_ptr_v2)
+ if (bkey_is_btree_ptr(k))
return BCH_DATA_btree;
return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
}
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+ EBUG_ON(sectors < 0);
+
+ return crc_is_compressed(p.crc)
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+ p.crc.uncompressed_size)
+ : sectors;
+}
+
static inline int gen_cmp(u8 a, u8 b)
{
return (s8) (a - b);
@@ -147,67 +137,78 @@ static inline u8 ptr_stale(struct bch_dev *ca,
return ret;
}
-/* bucket gc marks */
+/* Device usage: */
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
- return mark.dirty_sectors + mark.cached_sectors;
-}
+ struct bch_dev_usage ret;
-static inline bool is_available_bucket(struct bucket_mark mark)
-{
- return !mark.dirty_sectors && !mark.stripe;
+ bch2_dev_usage_read_fast(ca, &ret);
+ return ret;
}
-/* Device usage: */
-
-struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
+void bch2_dev_usage_init(struct bch_dev *);
-static inline u64 __dev_buckets_available(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve)
{
- u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
-
- if (WARN_ONCE(stats.buckets_unavailable > total,
- "buckets_unavailable overflow (%llu > %llu)\n",
- stats.buckets_unavailable, total))
- return 0;
-
- return total - stats.buckets_unavailable;
+ s64 reserved = 0;
+
+ switch (reserve) {
+ case RESERVE_NR:
+ unreachable();
+ case RESERVE_stripe:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case RESERVE_none:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case RESERVE_movinggc:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case RESERVE_btree_movinggc:
+ break;
+ }
+
+ return reserved;
}
-static inline u64 dev_buckets_available(struct bch_dev *ca)
+static inline u64 dev_buckets_free(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum alloc_reserve reserve)
{
- return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets -
+ ca->nr_open_buckets -
+ bch2_dev_buckets_reserved(ca, reserve));
}
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
- struct bch_dev_usage stats)
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum alloc_reserve reserve)
{
- struct bch_fs *c = ca->fs;
- s64 available = __dev_buckets_available(ca, stats);
- unsigned i;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
- available -= fifo_used(&ca->free[i]);
- available -= fifo_used(&ca->free_inc);
- available -= ca->nr_open_buckets;
- spin_unlock(&c->freelist_lock);
-
- return max(available, 0LL);
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets
+ + usage.d[BCH_DATA_cached].buckets
+ + usage.d[BCH_DATA_need_gc_gens].buckets
+ + usage.d[BCH_DATA_need_discard].buckets
+ - ca->nr_open_buckets
+ - bch2_dev_buckets_reserved(ca, reserve));
}
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum alloc_reserve reserve)
{
- return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
}
/* Filesystem usage: */
static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
-
return sizeof(struct bch_fs_usage) / sizeof(u64) +
READ_ONCE(c->replicas.nr);
}
@@ -235,18 +236,30 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
-void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
- size_t, enum bch_data_type, unsigned,
- struct gc_pos, unsigned);
-
-int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
-
-int bch2_mark_update(struct btree_trans *, struct btree_path *,
- struct bkey_i *, unsigned);
-
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
- struct bkey_s_c, unsigned);
+int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned,
+ struct gc_pos, unsigned);
+
+int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+
+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+
+void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
@@ -258,15 +271,39 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
- this_cpu_sub(*c->online_reserved, res->sectors);
- res->sectors = 0;
+ if (res->sectors) {
+ this_cpu_sub(*c->online_reserved, res->sectors);
+ res->sectors = 0;
+ }
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
-int bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- u64, int);
+int __bch2_disk_reservation_add(struct bch_fs *,
+ struct disk_reservation *,
+ u64, int);
+
+static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, int flags)
+{
+#ifdef __KERNEL__
+ u64 old, new;
+
+ do {
+ old = this_cpu_read(c->pcpu->sectors_available);
+ if (sectors > old)
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+
+ new = old - sectors;
+ } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
+
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+ return 0;
+#else
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+#endif
+}
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
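
The new inline bch2_disk_reservation_add() above adds a per-cpu fast path: each CPU keeps a small cache of pre-reserved sectors and only falls back to __bch2_disk_reservation_add(), which touches the global counters, when that cache runs dry. The kernel version loops on this_cpu_cmpxchg() because the task may migrate between CPUs; the single-threaded sketch below models the same idea with one cached counter and hypothetical names:

#include <stdint.h>

struct fs_counters {
	uint64_t sectors_cached;	/* stand-in for this CPU's sectors_available */
	uint64_t online_reserved;
	uint64_t sectors_free;		/* stand-in for the global pool */
};

struct reservation { uint64_t sectors; };

/* Slow path: take from the global pool (cache refill omitted for brevity). */
static int reservation_add_slowpath(struct fs_counters *c, struct reservation *res,
				    uint64_t sectors)
{
	if (sectors > c->sectors_free)
		return -1;		/* the real code returns an ENOSPC errcode */

	c->sectors_free		-= sectors;
	c->online_reserved	+= sectors;
	res->sectors		+= sectors;
	return 0;
}

static int reservation_add(struct fs_counters *c, struct reservation *res,
			   uint64_t sectors)
{
	/* Fast path: consume the local cache without touching global state. */
	if (sectors <= c->sectors_cached) {
		c->sectors_cached	-= sectors;
		c->online_reserved	+= sectors;
		res->sectors		+= sectors;
		return 0;
	}

	return reservation_add_slowpath(c, res, sectors);
}
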
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 24139831226d..2a9dab9006ef 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -7,33 +7,15 @@
#define BUCKET_JOURNAL_SEQ_BITS 16
-struct bucket_mark {
- union {
- atomic64_t v;
-
- struct {
- u8 gen;
- u8 data_type:3,
- owned_by_allocator:1,
- stripe:1;
- u16 dirty_sectors;
- u16 cached_sectors;
- };
- };
-};
-
struct bucket {
- union {
- struct bucket_mark _mark;
- const struct bucket_mark mark;
- };
-
- u64 io_time[2];
- u8 oldest_gen;
- u8 gc_gen;
- unsigned gen_valid:1;
- u8 stripe_redundancy;
- u32 stripe;
+ u8 lock;
+ u8 gen_valid:1;
+ u8 data_type:7;
+ u8 gen;
+ u8 stripe_redundancy;
+ u32 stripe;
+ u32 dirty_sectors;
+ u32 cached_sectors;
};
struct bucket_array {
@@ -52,7 +34,6 @@ struct bucket_gens {
struct bch_dev_usage {
u64 buckets_ec;
- u64 buckets_unavailable;
struct {
u64 buckets;
@@ -108,15 +89,4 @@ struct disk_reservation {
unsigned nr_replicas;
};
-struct copygc_heap_entry {
- u8 dev;
- u8 gen;
- u8 replicas;
- u16 fragmentation;
- u32 sectors;
- u64 offset;
-};
-
-typedef HEAP(struct copygc_heap_entry) copygc_heap;
-
#endif /* _BUCKETS_TYPES_H */
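
struct bucket loses the atomic bucket_mark union: the in-memory bucket state is now only kept for GC, and updates are serialized with the one-byte spinlock manipulated by the new bucket_lock()/bucket_unlock() helpers in buckets.h (xchg to acquire, smp_store_release to drop). A userspace sketch of that byte lock using C11 atomics, with a simplified bucket layout:

#include <stdatomic.h>

struct gc_bucket {
	atomic_uchar	lock;
	unsigned char	gen;
	unsigned int	dirty_sectors;
	unsigned int	cached_sectors;
};

static void gc_bucket_lock(struct gc_bucket *b)
{
	/* Spin until we observe the previous value 0, i.e. we took the lock. */
	while (atomic_exchange_explicit(&b->lock, 1, memory_order_acquire))
		;
}

static void gc_bucket_unlock(struct gc_bucket *b)
{
	atomic_store_explicit(&b->lock, 0, memory_order_release);
}
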
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index 7eb8482d1258..81ab685cdef9 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -2,36 +2,43 @@
#include "bcachefs.h"
#include "buckets_waiting_for_journal.h"
-#include <linux/jhash.h>
+#include <linux/hash.h>
+#include <linux/random.h>
-static u32 hash_seeds[] = {
- 2168153708,
- 1262039142,
- 1183479835,
-};
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+ unsigned hash_seed_idx, u64 dev_bucket)
+{
+ return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
+}
-static inline unsigned bucket_hash(u64 dev_bucket, unsigned hash_seed_idx)
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
{
- return jhash_2words(dev_bucket << 32, dev_bucket, hash_seeds[hash_seed_idx]);
+ unsigned i;
+
+ t->bits = bits;
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+ memset(t->d, 0, sizeof(t->d[0]) << t->bits);
}
-bool bch2_bucket_needs_journal_commit(struct bch_fs *c,
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
u64 flushed_seq,
unsigned dev, u64 bucket)
{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+ struct buckets_waiting_for_journal_table *t;
u64 dev_bucket = (u64) dev << 56 | bucket;
bool ret = false;
unsigned i;
mutex_lock(&b->lock);
- BUG_ON(!is_power_of_2(b->nr));
+ t = b->t;
- for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
- u32 h = bucket_hash(dev_bucket, i) & (b->nr - 1);
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
- if (b->d[h].dev_bucket == dev_bucket) {
- ret = b->d[h].journal_seq > flushed_seq;
+ if (h->dev_bucket == dev_bucket) {
+ ret = h->journal_seq > flushed_seq;
break;
}
}
@@ -41,78 +48,93 @@ bool bch2_bucket_needs_journal_commit(struct bch_fs *c,
return ret;
}
-static int bch2_buckets_waiting_for_journal_rehash(struct bch_fs *c)
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+ struct bucket_hashed *new,
+ u64 flushed_seq)
{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
- u64 flushed_seq = c->journal.flushed_seq_ondisk;
- unsigned i, j, h, new_nr = b->nr * 2, elements = 0;
- struct bucket_hashed *new_table;
+ struct bucket_hashed *last_evicted = NULL;
+ unsigned tries, i;
- new_table = kvmalloc_array(new_nr, sizeof(*new_table), __GFP_ZERO);
- if (!new_table)
- return -ENOMEM;
+ for (tries = 0; tries < 10; tries++) {
+ struct bucket_hashed *old, *victim = NULL;
- for (i = 0; i < b->nr; i++) {
- if (b->d[i].journal_seq < flushed_seq)
- continue;
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ old = bucket_hash(t, i, new->dev_bucket);
- for (j = 0; j < ARRAY_SIZE(hash_seeds); j++) {
- h = bucket_hash(b->d[i].dev_bucket, j);
- if ((h & (b->nr - 1)) == i)
- break;
- }
+ if (old->dev_bucket == new->dev_bucket ||
+ old->journal_seq <= flushed_seq) {
+ *old = *new;
+ return true;
+ }
- BUG_ON(j == ARRAY_SIZE(hash_seeds));
- BUG_ON(new_table[h & (new_nr - 1)].dev_bucket);
+ if (last_evicted != old)
+ victim = old;
+ }
- new_table[h & (new_nr - 1)] = b->d[i];
+ /* hashed to same slot 3 times: */
+ if (!victim)
+ break;
- elements++;
+ /* Failed to find an empty slot: */
+ swap(*new, *victim);
+ last_evicted = victim;
}
- kvfree(b->d);
- b->nr = new_nr;
- b->d = new_table;
- return 0;
+ return false;
}
-int bch2_set_bucket_needs_journal_commit(struct bch_fs *c, unsigned dev, u64 bucket,
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket,
u64 journal_seq)
{
- struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
- struct bucket_hashed new = {
+ struct buckets_waiting_for_journal_table *t, *n;
+ struct bucket_hashed tmp, new = {
.dev_bucket = (u64) dev << 56 | bucket,
.journal_seq = journal_seq,
- }, *old, *victim, *last_evicted = NULL;
- u64 flushed_seq = c->journal.flushed_seq_ondisk;
- unsigned tries, i;
+ };
+ size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
int ret = 0;
mutex_lock(&b->lock);
- BUG_ON(!is_power_of_2(b->nr));
-retry:
- for (tries = 0; tries < 5; tries++) {
- for (i = 0; i < ARRAY_SIZE(hash_seeds); i++) {
- old = b->d + (bucket_hash(new.dev_bucket, i) & (b->nr - 1));
- if (old->dev_bucket == new.dev_bucket ||
- old->journal_seq <= flushed_seq) {
- *old = new;
- goto out;
- }
+ if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+ goto out;
- if (last_evicted != old)
- victim = old;
- }
+ t = b->t;
+ size = 1UL << t->bits;
+ for (i = 0; i < size; i++)
+ nr_elements += t->d[i].journal_seq > flushed_seq;
- /* Failed to find an empty slot: */
- swap(new, *victim);
- last_evicted = victim;
+ new_bits = t->bits + (nr_elements * 3 > size);
+
+ n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
+ if (!n) {
+ ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+ goto out;
}
- ret = bch2_buckets_waiting_for_journal_rehash(c);
- if (!ret)
- goto retry;
+retry_rehash:
+ nr_rehashes++;
+ bucket_table_init(n, new_bits);
+
+ tmp = new;
+ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+ for (i = 0; i < 1UL << t->bits; i++) {
+ if (t->d[i].journal_seq <= flushed_seq)
+ continue;
+
+ tmp = t->d[i];
+ if (!bucket_table_insert(n, &tmp, flushed_seq))
+ goto retry_rehash;
+ }
+
+ b->t = n;
+ kvfree(t);
+
+ pr_debug("took %zu rehashes, table at %zu/%zu elements",
+ nr_rehashes, nr_elements, 1UL << b->t->bits);
out:
mutex_unlock(&b->lock);
@@ -123,19 +145,22 @@ void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
{
struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
- kvfree(b->d);
+ kvfree(b->t);
}
+#define INITIAL_TABLE_BITS 3
+
int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
{
struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
mutex_init(&b->lock);
- b->nr = 8;
- b->d = kvmalloc_array(b->nr, sizeof(*b->d), __GFP_ZERO);
- if (!b->d)
- return -ENOMEM;
+ b->t = kvmalloc(sizeof(*b->t) +
+ (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
+ if (!b->t)
+ return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
+ bucket_table_init(b->t, INITIAL_TABLE_BITS);
return 0;
}
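
bucket_table_insert() above is a cuckoo-style insert: each key has ARRAY_SIZE(hash_seeds) candidate slots, entries whose journal_seq is already flushed count as free, and when every candidate slot is occupied the resident entry is evicted and reinserted, giving up after a bounded number of displacements so the caller can grow the table and rehash. A compact standalone sketch of the same scheme, with simplified hashing and hypothetical names (table size fixed for the sketch; bits must be between 1 and 10 here):

#include <stdbool.h>
#include <stdint.h>

#define NR_HASHES 3

struct entry { uint64_t key, seq; };

struct table {
	unsigned	bits;
	uint64_t	seeds[NR_HASHES];
	struct entry	d[1 << 10];
};

static struct entry *slot(struct table *t, unsigned i, uint64_t key)
{
	/* Stand-in for hash_64(key ^ seed, bits): take the top bits of a mix. */
	uint64_t h = (key ^ t->seeds[i]) * 0x9e3779b97f4a7c15ULL;

	return &t->d[h >> (64 - t->bits)];
}

static bool table_insert(struct table *t, struct entry *new, uint64_t flushed_seq)
{
	struct entry *last_evicted = NULL;

	for (unsigned tries = 0; tries < 10; tries++) {
		struct entry *victim = NULL;

		for (unsigned i = 0; i < NR_HASHES; i++) {
			struct entry *old = slot(t, i, new->key);

			if (old->key == new->key || old->seq <= flushed_seq) {
				*old = *new;	/* overwrite a matching or stale entry */
				return true;
			}
			if (old != last_evicted)
				victim = old;	/* remember a slot we may evict */
		}

		if (!victim)		/* all candidates hash to the just-evicted slot */
			break;

		/* Evict: the displaced entry becomes the one we try to reinsert. */
		struct entry tmp = *victim;

		*victim = *new;
		*new = tmp;
		last_evicted = victim;
	}

	return false;			/* caller grows the table and rehashes */
}
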
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
index 079a591c500a..d2ae19cbe18c 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -4,8 +4,10 @@
#include "buckets_waiting_for_journal_types.h"
-bool bch2_bucket_needs_journal_commit(struct bch_fs *, u64, unsigned, u64);
-int bch2_set_bucket_needs_journal_commit(struct bch_fs *, unsigned, u64, u64);
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64, u64);
void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
index 99d17ffb7e94..e593db061d81 100644
--- a/fs/bcachefs/buckets_waiting_for_journal_types.h
+++ b/fs/bcachefs/buckets_waiting_for_journal_types.h
@@ -2,15 +2,22 @@
#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#include <linux/siphash.h>
+
struct bucket_hashed {
u64 dev_bucket;
u64 journal_seq;
};
+struct buckets_waiting_for_journal_table {
+ unsigned bits;
+ u64 hash_seeds[3];
+ struct bucket_hashed d[];
+};
+
struct buckets_waiting_for_journal {
struct mutex lock;
- size_t nr;
- struct bucket_hashed *d;
+ struct buckets_waiting_for_journal_table *t;
};
#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
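
The table is now one allocation: a small header (bits plus per-table hash seeds) followed by a flexible array of 1 << bits slots, so resizing just allocates sizeof(*t) + (sizeof(t->d[0]) << new_bits) and swaps the pointer. The .c file packs keys as (u64) dev << 56 | bucket. A short sketch of the layout arithmetic and the key packing, using userspace calloc in place of kvmalloc:

#include <stdint.h>
#include <stdlib.h>

struct slot { uint64_t dev_bucket, journal_seq; };

struct wait_table {
	unsigned	bits;
	uint64_t	seeds[3];
	struct slot	d[];		/* 1 << bits entries follow the header */
};

static struct wait_table *wait_table_alloc(unsigned bits)
{
	return calloc(1, sizeof(struct wait_table) + (sizeof(struct slot) << bits));
}

/* Device index lives in the top byte, bucket number in the low 56 bits: */
static uint64_t pack_dev_bucket(unsigned dev, uint64_t bucket)
{
	return (uint64_t) dev << 56 | bucket;
}
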
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index aa26588ed5ed..eecc35505d6e 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -284,6 +284,8 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
return PTR_ERR(ca);
ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+ if (ret)
+ bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
percpu_ref_put(&ca->ref);
return ret;
@@ -501,13 +503,12 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
- arg.available_buckets = arg.nr_buckets - src.buckets_unavailable;
- arg.ec_buckets = src.buckets_ec;
- arg.ec_sectors = 0;
+ arg.buckets_ec = src.buckets_ec;
for (i = 0; i < BCH_DATA_NR; i++) {
- arg.buckets[i] = src.d[i].buckets;
- arg.sectors[i] = src.d[i].sectors;
+ arg.d[i].buckets = src.d[i].buckets;
+ arg.d[i].sectors = src.d[i].sectors;
+ arg.d[i].fragmented = src.d[i].fragmented;
}
percpu_ref_put(&ca->ref);
@@ -632,11 +633,14 @@ do { \
\
if (copy_from_user(&i, arg, sizeof(i))) \
return -EFAULT; \
- return bch2_ioctl_##_name(c, i); \
+ ret = bch2_ioctl_##_name(c, i); \
+ goto out; \
} while (0)
long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
{
+ long ret;
+
switch (cmd) {
case BCH_IOCTL_QUERY_UUID:
return bch2_ioctl_query_uuid(c, arg);
@@ -680,6 +684,10 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
default:
return -ENOTTY;
}
+out:
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
}
static DEFINE_IDR(bch_chardev_minor);
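
The ioctl path above now funnels through a common exit so that bcachefs' private error codes (the -BCH_ERR_* values introduced throughout this patch) are translated back to standard errnos before reaching userspace; that is what bch2_err_class() is for. A hedged sketch of the idea with made-up code values, not the real errcode table:

#include <errno.h>

/* Hypothetical private error codes, allocated above the normal errno range: */
enum {
	ERR_START = 2048,
	ERR_ENOMEM_bucket_gens,		/* class: ENOMEM */
	ERR_ENOSPC_disk_reservation,	/* class: ENOSPC */
};

/* Map a private code back to the generic errno userspace understands. */
static int err_class(int err)
{
	switch (-err) {
	case ERR_ENOMEM_bucket_gens:
		return -ENOMEM;
	case ERR_ENOSPC_disk_reservation:
		return -ENOSPC;
	default:
		return err;		/* already a plain errno (or success) */
	}
}
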
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 95e80dbfed4c..843e138862f6 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "checksum.h"
+#include "errcode.h"
#include "super.h"
#include "super-io.h"
@@ -11,7 +12,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
@@ -93,35 +94,69 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
}
}
-static inline void do_encrypt_sg(struct crypto_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ struct scatterlist *sg, size_t len)
{
- SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
- BUG_ON(ret);
+ if (ret)
+ pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+ return ret;
}
-static inline void do_encrypt(struct crypto_skcipher *tfm,
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
- struct scatterlist sg;
+ if (!is_vmalloc_addr(buf)) {
+ struct scatterlist sg;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg,
+ is_vmalloc_addr(buf)
+ ? vmalloc_to_page(buf)
+ : virt_to_page(buf),
+ len, offset_in_page(buf));
+ return do_encrypt_sg(tfm, nonce, &sg, len);
+ } else {
+ unsigned pages = buf_pages(buf, len);
+ struct scatterlist *sg;
+ size_t orig_len = len;
+ int ret, i;
+
+ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
+ return -BCH_ERR_ENOMEM_do_encrypt;
+
+ sg_init_table(sg, pages);
+
+ for (i = 0; i < pages; i++) {
+ unsigned offset = offset_in_page(buf);
+ unsigned pg_len = min(len, PAGE_SIZE - offset);
+
+ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
+ buf += pg_len;
+ len -= pg_len;
+ }
- sg_init_one(&sg, buf, len);
- do_encrypt_sg(tfm, nonce, &sg, len);
+ ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
+ kfree(sg);
+ return ret;
+ }
}
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
- struct crypto_skcipher *chacha20 =
- crypto_alloc_skcipher("chacha20", 0, 0);
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (!chacha20) {
@@ -129,31 +164,36 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return PTR_ERR(chacha20);
}
- ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
}
- do_encrypt(chacha20, nonce, buf, len);
+ ret = do_encrypt(chacha20, nonce, buf, len);
err:
- crypto_free_skcipher(chacha20);
+ crypto_free_sync_skcipher(chacha20);
return ret;
}
-static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+ struct nonce nonce)
{
u8 key[POLY1305_KEY_SIZE];
+ int ret;
nonce.d[3] ^= BCH_NONCE_POLY;
memset(key, 0, sizeof(key));
- do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ if (ret)
+ return ret;
desc->tfm = c->poly1305;
crypto_shash_init(desc);
crypto_shash_update(desc, key, sizeof(key));
+ return 0;
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
@@ -195,13 +235,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
}
}
-void bch2_encrypt(struct bch_fs *c, unsigned type,
+int bch2_encrypt(struct bch_fs *c, unsigned type,
struct nonce nonce, void *data, size_t len)
{
if (!bch2_csum_type_is_encryption(type))
- return;
+ return 0;
- do_encrypt(c->chacha20, nonce, data, len);
+ return do_encrypt(c->chacha20, nonce, data, len);
}
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
@@ -230,7 +270,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
@@ -253,7 +293,7 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
kunmap_atomic(p);
}
#else
- __bio_for_each_contig_segment(bv, bio, *iter, *iter)
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
@@ -276,23 +316,27 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
-void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
- struct nonce nonce, struct bio *bio)
+int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
struct scatterlist sgl[16], *sg = sgl;
size_t bytes = 0;
+ int ret = 0;
if (!bch2_csum_type_is_encryption(type))
- return;
+ return 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ if (ret)
+ return ret;
nonce = nonce_add(nonce, bytes);
bytes = 0;
@@ -306,7 +350,7 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
}
sg_mark_end(sg - 1);
- do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
}
struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
@@ -382,8 +426,17 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
merged = bch2_checksum_bio(c, crc_old.csum_type,
extent_nonce(version, crc_old), bio);
- if (bch2_crc_cmp(merged, crc_old.csum))
+ if (bch2_crc_cmp(merged, crc_old.csum)) {
+ bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo,
+ bch2_csum_types[crc_old.csum_type],
+ bch2_csum_types[new_csum_type]);
return -EIO;
+ }
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
if (i->crc)
@@ -412,7 +465,7 @@ static int __bch2_request_key(char *key_description, struct bch_key *key)
const struct user_key_payload *ukp;
int ret;
- keyring_key = request_key(&key_type_logon, key_description, NULL);
+ keyring_key = request_key(&key_type_user, key_description, NULL);
if (IS_ERR(keyring_key))
return PTR_ERR(keyring_key);
@@ -450,13 +503,15 @@ static int __bch2_request_key(char *key_description, struct bch_key *key)
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
{
- char key_description[60];
- char uuid[40];
+ struct printbuf key_description = PRINTBUF;
+ int ret;
- uuid_unparse_lower(sb->user_uuid.b, uuid);
- sprintf(key_description, "bcachefs:%s", uuid);
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
- return __bch2_request_key(key_description, key);
+ ret = __bch2_request_key(key_description.buf, key);
+ printbuf_exit(&key_description);
+ return ret;
}
int bch2_decrypt_sb_key(struct bch_fs *c,
@@ -473,7 +528,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key: %i", ret);
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
goto err;
}
@@ -498,20 +553,24 @@ err:
static int bch2_alloc_ciphers(struct bch_fs *c)
{
+ int ret;
+
if (!c->chacha20)
- c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
- if (IS_ERR(c->chacha20)) {
- bch_err(c, "error requesting chacha20 module: %li",
- PTR_ERR(c->chacha20));
- return PTR_ERR(c->chacha20);
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ ret = PTR_ERR_OR_ZERO(c->chacha20);
+
+ if (ret) {
+ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
+ return ret;
}
if (!c->poly1305)
c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- if (IS_ERR(c->poly1305)) {
- bch_err(c, "error requesting poly1305 module: %li",
- PTR_ERR(c->poly1305));
- return PTR_ERR(c->poly1305);
+ ret = PTR_ERR_OR_ZERO(c->poly1305);
+
+ if (ret) {
+ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+ return ret;
}
return 0;
@@ -572,7 +631,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
if (keyed) {
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key: %i", ret);
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
goto err;
}
@@ -582,14 +641,14 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
goto err;
}
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
if (!crypt) {
- ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
+ ret = -BCH_ERR_ENOSPC_sb_crypt;
goto err;
}
@@ -610,7 +669,7 @@ void bch2_fs_encryption_exit(struct bch_fs *c)
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
- crypto_free_skcipher(c->chacha20);
+ crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
@@ -624,9 +683,9 @@ int bch2_fs_encryption_init(struct bch_fs *c)
pr_verbose_init(c->opts, "");
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(c->sha256)) {
- bch_err(c, "error requesting sha256 module");
- ret = PTR_ERR(c->sha256);
+ ret = PTR_ERR_OR_ZERO(c->sha256);
+ if (ret) {
+ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
goto out;
}
@@ -642,7 +701,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
if (ret)
goto out;
- ret = crypto_skcipher_setkey(c->chacha20,
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
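
With this change the encryption helpers return errors rather than BUG()ing, so callers are expected to check them. A minimal caller sketch, assuming encryption is enabled; the helper itself is hypothetical and not part of the patch:

	/* Hypothetical caller: encryption failures now propagate as errnos. */
	static int example_encrypt_buf(struct bch_fs *c, struct nonce nonce,
				       void *buf, size_t len)
	{
		int ret = bch2_encrypt(c, BCH_CSUM_chacha20_poly1305_80,
				       nonce, buf, len);
		if (ret)
			bch_err(c, "encrypt error: %s", bch2_err_str(ret));
		return ret;
	}
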
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 5e0e77ca71a9..409ad534d9f4 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -7,7 +7,7 @@
#include "super-io.h"
#include <linux/crc64.h>
-#include <crypto/chacha20.h>
+#include <crypto/chacha.h>
static inline bool bch2_checksum_mergeable(unsigned type)
{
@@ -49,7 +49,7 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
-void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
void *data, size_t);
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
@@ -61,8 +61,16 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
-void bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
+int __bch2_encrypt_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
+
+static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ return bch2_csum_type_is_encryption(type)
+ ? __bch2_encrypt_bio(c, type, nonce, bio)
+ : 0;
+}
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
@@ -78,27 +86,30 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
{
switch (type) {
case BCH_CSUM_OPT_none:
- return BCH_CSUM_none;
+ return BCH_CSUM_none;
case BCH_CSUM_OPT_crc32c:
- return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
case BCH_CSUM_OPT_crc64:
- return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
case BCH_CSUM_OPT_xxhash:
- return BCH_CSUM_xxhash;
+ return BCH_CSUM_xxhash;
default:
- BUG();
+ BUG();
}
}
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
- unsigned opt)
+ struct bch_io_opts opts)
{
+ if (opts.nocow)
+ return 0;
+
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_chacha20_poly1305_128
: BCH_CSUM_chacha20_poly1305_80;
- return bch2_csum_opt_to_type(opt, true);
+ return bch2_csum_opt_to_type(opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@@ -140,9 +151,9 @@ static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
- EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
- le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 4324cfe7eed0..f41889093a2c 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -122,7 +122,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
}
__set_current_state(TASK_RUNNING);
- del_singleshot_timer_sync(&wait.cpu_timer);
+ del_timer_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
@@ -157,14 +157,16 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned long now;
unsigned i;
+ out->atomic++;
spin_lock(&clock->timer_lock);
now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
- pr_buf(out, "%ps:\t%li\n",
+ prt_printf(out, "%ps:\t%li\n",
clock->timers.data[i]->fn,
clock->timers.data[i]->expire - now);
spin_unlock(&clock->timer_lock);
+ --out->atomic;
}
void bch2_io_clock_exit(struct io_clock *clock)
@@ -182,10 +184,10 @@ int bch2_io_clock_init(struct io_clock *clock)
clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
if (!clock->pcpu_buf)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_io_clock_init;
if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_io_clock_init;
return 0;
}
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 691009fc2431..6bec38440249 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -45,7 +45,7 @@ static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
struct bvec_iter iter;
void *expected_start = NULL;
- __bio_for_each_segment(bv, bio, iter, start) {
+ __bio_for_each_bvec(bv, bio, iter, start) {
if (expected_start &&
expected_start != page_address(bv.bv_page) + bv.bv_offset)
return false;
@@ -197,9 +197,9 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
goto err;
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
- ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
- ret = ZSTD_decompressDCtx(ctx,
+ ret = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
@@ -270,7 +270,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
{
struct bbuf dst_data = { NULL };
size_t dst_len = crc.uncompressed_size << 9;
- int ret = -ENOMEM;
+ int ret;
if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc.compressed_size << 9 > c->opts.encoded_extent_max)
@@ -333,8 +333,8 @@ static int attempt_compress(struct bch_fs *c,
return strm.total_out;
}
case BCH_COMPRESSION_TYPE_zstd: {
- ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
- ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+ ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
+ zstd_cctx_workspace_bound(&c->zstd_params.cParams));
/*
* ZSTD requires that when we decompress we pass in the exact
@@ -347,11 +347,11 @@ static int attempt_compress(struct bch_fs *c,
* factor (7 bytes) from the dst buffer size to account for
* that.
*/
- size_t len = ZSTD_compressCCtx(ctx,
+ size_t len = zstd_compress_cctx(ctx,
dst + 4, dst_len - 4 - 7,
src, src_len,
- c->zstd_params);
- if (ZSTD_isError(len))
+ &c->zstd_params);
+ if (zstd_is_error(len))
return 0;
*((__le32 *) dst) = cpu_to_le32(len);
@@ -377,7 +377,7 @@ static unsigned __bio_compress(struct bch_fs *c,
/* If it's only one block, don't bother trying to compress: */
if (src->bi_iter.bi_size <= c->opts.block_size)
- return 0;
+ return BCH_COMPRESSION_TYPE_incompressible;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
@@ -542,11 +542,11 @@ void bch2_fs_compress_exit(struct bch_fs *c)
mempool_exit(&c->compression_bounce[READ]);
}
-static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+static int _bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
- ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0);
+ ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max);
struct {
unsigned feature;
unsigned type;
@@ -558,37 +558,30 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
- ZSTD_CCtxWorkspaceBound(params.cParams),
- ZSTD_DCtxWorkspaceBound() },
+ zstd_cctx_workspace_bound(&params.cParams),
+ zstd_dctx_workspace_bound() },
}, *i;
- int ret = 0;
-
- pr_verbose_init(c->opts, "");
+ bool have_compressed = false;
c->zstd_params = params;
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++)
- if (features & (1 << i->feature))
- goto have_compressed;
+ have_compressed |= (features & (1 << i->feature)) != 0;
- goto out;
-have_compressed:
+ if (!have_compressed)
+ return 0;
- if (!mempool_initialized(&c->compression_bounce[READ])) {
- ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
- 1, c->opts.encoded_extent_max);
- if (ret)
- goto out;
- }
+ if (!mempool_initialized(&c->compression_bounce[READ]) &&
+ mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_read_init;
- if (!mempool_initialized(&c->compression_bounce[WRITE])) {
- ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
- 1, c->opts.encoded_extent_max);
- if (ret)
- goto out;
- }
+ if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
+ mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
@@ -605,22 +598,28 @@ have_compressed:
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
- ret = mempool_init_kvpmalloc_pool(
+ if (mempool_init_kvpmalloc_pool(
&c->compress_workspace[i->type],
- 1, i->compress_workspace);
- if (ret)
- goto out;
+ 1, i->compress_workspace))
+ return -BCH_ERR_ENOMEM_compression_workspace_init;
}
- if (!mempool_initialized(&c->decompress_workspace)) {
- ret = mempool_init_kvpmalloc_pool(
- &c->decompress_workspace,
- 1, decompress_workspace_size);
- if (ret)
- goto out;
- }
-out:
+ if (!mempool_initialized(&c->decompress_workspace) &&
+ mempool_init_kvpmalloc_pool(&c->decompress_workspace,
+ 1, decompress_workspace_size))
+ return -BCH_ERR_ENOMEM_decompression_workspace_init;
+
+ return 0;
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+{
+ int ret;
+
+ pr_verbose_init(c->opts, "");
+ ret = _bch2_fs_compress_init(c, features);
pr_verbose_init(c->opts, "ret %i", ret);
+
return ret;
}
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c
new file mode 100644
index 000000000000..e5587bc5a2b7
--- /dev/null
+++ b/fs/bcachefs/counters.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
+
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+ if (!ctrs)
+ return 0;
+
+ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+};
+
+static int bch2_sb_counters_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ return 0;
+};
+
+void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ for (i = 0; i < nr; i++) {
+ if (i < BCH_COUNTER_NR)
+ prt_printf(out, "%s ", bch2_counter_names[i]);
+ else
+ prt_printf(out, "(unknown)");
+
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
+ prt_newline(out);
+ };
+};
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+ u64 val = 0;
+
+ for (i = 0; i < BCH_COUNTER_NR; i++)
+ c->counters_on_mount[i] = 0;
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+ val = le64_to_cpu(ctrs->d[i]);
+ percpu_u64_set(&c->counters[i], val);
+ c->counters_on_mount[i] = val;
+ }
+ return 0;
+};
+
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+ struct bch_sb_field_counters *ret;
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ if (nr < BCH_COUNTER_NR) {
+ ret = bch2_sb_resize_counters(&c->disk_sb,
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+ if (ret) {
+ ctrs = ret;
+ nr = bch2_sb_counter_nr_entries(ctrs);
+ }
+ }
+
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ return 0;
+}
+
+void bch2_fs_counters_exit(struct bch_fs *c)
+{
+ free_percpu(c->counters);
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+ if (!c->counters)
+ return -BCH_ERR_ENOMEM_fs_counters_init;
+
+ return bch2_sb_counters_to_cpu(c);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+ .validate = bch2_sb_counters_validate,
+ .to_text = bch2_sb_counters_to_text,
+};
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h
new file mode 100644
index 000000000000..4778aa19bf34
--- /dev/null
+++ b/fs/bcachefs/counters.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COUNTERS_H
+#define _BCACHEFS_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+
+int bch2_sb_counters_to_cpu(struct bch_fs *);
+int bch2_sb_counters_from_cpu(struct bch_fs *);
+
+void bch2_fs_counters_exit(struct bch_fs *);
+int bch2_fs_counters_init(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif // _BCACHEFS_COUNTERS_H
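
A minimal sketch of how these persistent counters are used, based on the helpers above and the call sites later in this patch; the wrapper function is hypothetical:

	/* Hypothetical wrapper, not from the patch: runtime events bump the
	 * percpu counters allocated by bch2_fs_counters_init(), and
	 * bch2_sb_counters_from_cpu() folds the totals back into the
	 * superblock field (presumably at superblock write time). */
	static void example_count_extent_moved(struct bch_fs *c, u64 sectors)
	{
		this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], sectors);
	}
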
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
new file mode 100644
index 000000000000..978ab7961f1b
--- /dev/null
+++ b/fs/bcachefs/darray.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include "util.h"
+#include <linux/slab.h>
+
+#define DARRAY(type) \
+struct { \
+ size_t nr, size; \
+ type *data; \
+}
+
+typedef DARRAY(void) darray_void;
+
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+{
+ if (d->nr + more > d->size) {
+ size_t new_size = roundup_pow_of_two(d->nr + more);
+ void *data = krealloc_array(d->data, new_size, t_size, gfp);
+
+ if (!data)
+ return -ENOMEM;
+
+ d->data = data;
+ d->size = new_size;
+ }
+
+ return 0;
+}
+
+#define darray_make_room_gfp(_d, _more, _gfp) \
+ __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+
+#define darray_make_room(_d, _more) \
+ darray_make_room_gfp(_d, _more, GFP_KERNEL)
+
+#define darray_top(_d) ((_d).data[(_d).nr])
+
+#define darray_push_gfp(_d, _item, _gfp) \
+({ \
+ int _ret = darray_make_room_gfp((_d), 1, _gfp); \
+ \
+ if (!_ret) \
+ (_d)->data[(_d)->nr++] = (_item); \
+ _ret; \
+})
+
+#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
+
+#define darray_insert_item(_d, _pos, _item) \
+({ \
+ size_t pos = (_pos); \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \
+ _ret; \
+})
+
+#define darray_for_each(_d, _i) \
+ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_init(_d) \
+do { \
+ (_d)->data = NULL; \
+ (_d)->nr = (_d)->size = 0; \
+} while (0)
+
+#define darray_exit(_d) \
+do { \
+ kfree((_d)->data); \
+ darray_init(_d); \
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
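
A minimal usage sketch of the darray macros defined above; the type and function here are hypothetical, but the calling pattern matches how insert_snapshot_whiteouts() below handles its snapshot_id_list:

	/* Hypothetical example, not from the patch: */
	typedef DARRAY(u32) example_u32_list;

	static int example_collect(void)
	{
		example_u32_list ids;
		u32 *i;
		int ret;

		darray_init(&ids);

		ret = darray_push(&ids, 42);	/* grows via krealloc_array() */
		if (ret)			/* -ENOMEM on allocation failure */
			goto out;

		darray_for_each(ids, i)
			pr_info("id %u\n", *i);
	out:
		darray_exit(&ids);		/* kfree()s the array and reinitializes it */
		return ret;
	}
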
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
new file mode 100644
index 000000000000..6b0a5fbe377c
--- /dev/null
+++ b/fs/bcachefs/data_update.c
@@ -0,0 +1,638 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "io.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "subvolume.h"
+
+#include <trace/events/bcachefs.h>
+
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, iter2;
+ struct bkey_s_c k, k2;
+ snapshot_id_list s;
+ struct bkey_i *update;
+ int ret;
+
+ if (!btree_type_has_snapshots(id))
+ return 0;
+
+ darray_init(&s);
+
+ if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, id, old_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (!k.k)
+ break;
+
+ if (!bkey_eq(old_pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot) &&
+ !snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) {
+ struct bpos whiteout_pos = new_pos;
+
+ whiteout_pos.snapshot = k.k->p.snapshot;
+
+ bch2_trans_iter_init(trans, &iter2, id, whiteout_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ k2 = bch2_btree_iter_peek_slot(&iter2);
+ ret = bkey_err(k2);
+
+ if (!ret && k2.k->type == KEY_TYPE_deleted) {
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = whiteout_pos;
+ update->k.type = KEY_TYPE_whiteout;
+
+ ret = bch2_trans_update(trans, &iter2, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ }
+ bch2_trans_iter_exit(trans, &iter2);
+
+ if (ret)
+ break;
+
+ ret = snapshot_list_add(c, &s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ darray_exit(&s);
+
+ return ret;
+}
+
+static void trace_move_extent_fail2(struct data_update *m,
+ struct bkey_s_c new,
+ struct bkey_s_c wrote,
+ struct bkey_i *insert,
+ const char *msg)
+{
+ struct bch_fs *c = m->op.c;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ unsigned i, rewrites_found = 0;
+
+ if (!trace_move_extent_fail_enabled())
+ return;
+
+ prt_str(&buf, msg);
+
+ if (insert) {
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+ struct bkey_s new_s;
+ new_s.k = (void *) new.k;
+ new_s.v = (void *) new.v;
+
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached)
+ rewrites_found |= 1U << i;
+ i++;
+ }
+ }
+
+ prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u",
+ (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);
+
+ prt_printf(&buf, "\nrewrites found: %u%u%u%u",
+ (rewrites_found & (1 << 0)) != 0,
+ (rewrites_found & (1 << 1)) != 0,
+ (rewrites_found & (1 << 2)) != 0,
+ (rewrites_found & (1 << 3)) != 0);
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, new);
+
+ prt_str(&buf, "\nwrote: ");
+ bch2_bkey_val_to_text(&buf, c, wrote);
+
+ if (insert) {
+ prt_str(&buf, "\ninsert: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ }
+
+ trace_move_extent_fail(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static int __bch2_data_update_index_update(struct btree_trans *trans,
+ struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_iter iter;
+ struct data_update *m =
+ container_of(op, struct data_update, op);
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_buf _new, _insert;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&_new);
+ bch2_bkey_buf_init(&_insert);
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+ bch2_trans_iter_init(trans, &iter, m->btree_id,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ while (1) {
+ struct bkey_s_c k;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ struct bkey_i *insert = NULL;
+ struct bkey_i_extent *new;
+ const union bch_extent_entry *entry_c;
+ union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_extent_ptr *ptr;
+ const struct bch_extent_ptr *ptr_c;
+ struct bpos next_pos;
+ bool should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ unsigned rewrites_found = 0, durability, i;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+ if (!bch2_extents_match(k, old)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+ NULL, "no match:");
+ goto nowork;
+ }
+
+ bkey_reassemble(_insert.k, k);
+ insert = _insert.k;
+
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(_new.k);
+ bch2_cut_front(iter.pos, &new->k_i);
+
+ bch2_cut_front(iter.pos, insert);
+ bch2_cut_back(new->k.p, insert);
+ bch2_cut_back(insert->k.p, &new->k_i);
+
+ /*
+ * @old: extent that we read from
+ * @insert: key that we're going to update, initialized from
+ * extent currently in btree - same as @old unless we raced with
+ * other updates
+ * @new: extent with new pointers that we'll be adding to @insert
+ *
+	 * First, drop rewrite_ptrs from @new:
+ */
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+ /*
+ * See comment below:
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+ */
+ rewrites_found |= 1U << i;
+ }
+ i++;
+ }
+
+ if (m->data_opts.rewrite_ptrs &&
+ !rewrites_found &&
+ bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+ goto nowork;
+ }
+
+ /*
+ * A replica that we just wrote might conflict with a replica
+ * that we want to keep, due to racing with another move:
+ */
+restart_drop_conflicting_replicas:
+ extent_for_each_ptr(extent_i_to_s(new), ptr)
+ if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
+ !ptr_c->cached) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
+ goto restart_drop_conflicting_replicas;
+ }
+
+ if (!bkey_val_u64s(&new->k)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+ goto nowork;
+ }
+
+ /* Now, drop pointers that conflict with what we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+
+ durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
+ bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
+
+ /* Now, drop excess replicas: */
+restart_drop_extra_replicas:
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
+ /*
+ * Currently, we're dropping unneeded replicas
+ * instead of marking them as cached, since
+ * cached data in stripe buckets prevents them
+ * from being reused:
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+ */
+ goto restart_drop_extra_replicas;
+ }
+ }
+
+ /* Finally, add the pointers we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ bch2_extent_ptr_decoded_append(insert, &p);
+
+ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+ bch2_extent_normalize(c, bkey_i_to_s(insert));
+
+ ret = bch2_sum_sector_overwrites(trans, &iter, insert,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ goto err;
+
+ if (disk_sectors_delta > (s64) op->res.sectors) {
+ ret = bch2_disk_reservation_add(c, &op->res,
+ disk_sectors_delta - op->res.sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto out;
+ }
+
+ next_pos = insert->k.p;
+
+ if (!bkey_eq(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
+ ret = insert_snapshot_whiteouts(trans, m->btree_id, k.k->p,
+ bkey_start_pos(&insert->k));
+ if (ret)
+ goto err;
+ }
+
+ if (!bkey_eq(insert->k.p, k.k->p)) {
+ ret = insert_snapshot_whiteouts(trans, m->btree_id,
+ k.k->p, insert->k.p);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, insert,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, &op->res,
+ NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ m->data_opts.btree_insert_flags);
+ if (!ret) {
+ bch2_btree_iter_set_pos(&iter, next_pos);
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+ trace_move_extent_finish(&new->k);
+ }
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+next:
+ while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
+ bch2_keylist_pop_front(keys);
+ if (bch2_keylist_empty(keys))
+ goto out;
+ }
+ continue;
+nowork:
+ if (m->ctxt && m->ctxt->stats) {
+ BUG_ON(k.k->p.offset <= iter.pos.offset);
+ atomic64_inc(&m->ctxt->stats->keys_raced);
+ atomic64_add(k.k->p.offset - iter.pos.offset,
+ &m->ctxt->stats->sectors_raced);
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_fail], new->k.size);
+
+ bch2_btree_iter_advance(&iter);
+ goto next;
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_bkey_buf_exit(&_insert, c);
+ bch2_bkey_buf_exit(&_new, c);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ return ret;
+}
+
+int bch2_data_update_index_update(struct bch_write_op *op)
+{
+ return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
+}
+
+void bch2_data_update_read_done(struct data_update *m,
+ struct bch_extent_crc_unpacked crc)
+{
+ /* write bio must own pages: */
+ BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+ m->op.crc = crc;
+ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+
+ closure_call(&m->op.cl, bch2_write, NULL, NULL);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bkey_ptrs_c ptrs =
+ bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (c->opts.nocow_enabled)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, ptr), 0);
+ percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ }
+
+ bch2_bkey_buf_exit(&update->k, c);
+ bch2_disk_reservation_put(c, &update->op.res);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+}
+
+void bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bio *bio = &update->op.wbio.bio;
+ struct bkey_i_extent *e;
+ struct write_point *wp;
+ struct bch_extent_ptr *ptr;
+ struct closure cl;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ closure_init_stack(&cl);
+ bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
+
+ while (bio_sectors(bio)) {
+ unsigned sectors = bio_sectors(bio);
+
+ bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
+ BTREE_ITER_SLOTS);
+ ret = lockrestart_do(trans, ({
+ k = bch2_btree_iter_peek_slot(&iter);
+ bkey_err(k);
+ }));
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
+ break;
+
+ e = bkey_extent_init(update->op.insert_keys.top);
+ e->k.p = update->op.pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ update->op.target,
+ false,
+ update->op.write_point,
+ &update->op.devs_have,
+ update->op.nr_replicas,
+ update->op.nr_replicas,
+ update->op.alloc_reserve,
+ 0, &cl, &wp);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ continue;
+ }
+
+ if (ret)
+ return;
+
+ sectors = min(sectors, wp->sectors_free);
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &update->op.open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ bio_advance(bio, sectors << 9);
+ update->op.pos.offset += sectors;
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ bch2_keylist_push(&update->op.insert_keys);
+
+ ret = __bch2_data_update_index_update(trans, &update->op);
+
+ bch2_open_buckets_put(c, &update->op.open_buckets);
+
+ if (ret)
+ break;
+ }
+
+ if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ }
+}
+
+int bch2_data_update_init(struct btree_trans *trans,
+ struct moving_context *ctxt,
+ struct data_update *m,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts,
+ enum btree_id btree_id,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ const struct bch_extent_ptr *ptr;
+ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+ unsigned ptrs_locked = 0;
+ int ret;
+
+ bch2_bkey_buf_init(&m->k);
+ bch2_bkey_buf_reassemble(&m->k, c, k);
+ m->btree_id = btree_id;
+ m->data_opts = data_opts;
+
+ bch2_write_op_init(&m->op, c, io_opts);
+ m->op.pos = bkey_start_pos(k.k);
+ m->op.version = k.k->version;
+ m->op.target = data_opts.target;
+ m->op.write_point = wp;
+ m->op.nr_replicas = 0;
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
+ BCH_WRITE_PAGES_OWNED|
+ BCH_WRITE_DATA_ENCODED|
+ BCH_WRITE_MOVE|
+ m->data_opts.write_flags;
+ m->op.compression_type =
+ bch2_compression_opt_to_type[io_opts.background_compression ?:
+ io_opts.compression];
+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+ m->op.alloc_reserve = RESERVE_movinggc;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ bool locked;
+
+ if (((1U << i) & m->data_opts.rewrite_ptrs)) {
+ BUG_ON(p.ptr.cached);
+
+ if (crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
+ m->op.nr_replicas += bch2_extent_ptr_durability(c, &p);
+ } else if (!p.ptr.cached) {
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+ }
+
+ /*
+ * op->csum_type is normally initialized from the fs/file's
+ * current options - but if an extent is encrypted, we require
+ * that it stays encrypted:
+ */
+ if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+ m->op.nonce = p.crc.nonce + p.crc.offset;
+ m->op.csum_type = p.crc.csum_type;
+ }
+
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ m->op.incompressible = true;
+
+ if (c->opts.nocow_enabled) {
+ if (ctxt) {
+ move_ctxt_wait_event(ctxt, trans,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0)) ||
+ !atomic_read(&ctxt->read_sectors));
+
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0);
+ } else {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0)) {
+ ret = -BCH_ERR_nocow_lock_blocked;
+ goto err;
+ }
+ }
+ ptrs_locked |= (1U << i);
+ }
+
+ i++;
+ }
+
+ if (reserve_sectors) {
+ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+ m->data_opts.extra_replicas
+ ? 0
+ : BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ goto err;
+ }
+
+ m->op.nr_replicas += m->data_opts.extra_replicas;
+ m->op.nr_replicas_required = m->op.nr_replicas;
+
+ BUG_ON(!m->op.nr_replicas);
+
+ /* Special handling required: */
+ if (bkey_extent_is_unwritten(k))
+ return -BCH_ERR_unwritten_extent_update;
+ return 0;
+err:
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if ((1U << i) & ptrs_locked)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0);
+ percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
+ i++;
+ }
+
+ bch2_bkey_buf_exit(&m->k, c);
+ bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
+ return ret;
+}
+
+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
+ opts->kill_ptrs |= 1U << i;
+ opts->rewrite_ptrs ^= 1U << i;
+ }
+
+ i++;
+ }
+}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
new file mode 100644
index 000000000000..49e9055cbb52
--- /dev/null
+++ b/fs/bcachefs/data_update.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_types.h"
+
+struct moving_context;
+
+struct data_update_opts {
+ unsigned rewrite_ptrs;
+ unsigned kill_ptrs;
+ u16 target;
+ u8 extra_replicas;
+ unsigned btree_insert_flags;
+ unsigned write_flags;
+};
+
+struct data_update {
+ /* extent being updated: */
+ enum btree_id btree_id;
+ struct bkey_buf k;
+ struct data_update_opts data_opts;
+ struct moving_context *ctxt;
+ struct bch_write_op op;
+};
+
+int bch2_data_update_index_update(struct bch_write_op *);
+
+void bch2_data_update_read_done(struct data_update *,
+ struct bch_extent_crc_unpacked);
+
+void bch2_data_update_exit(struct data_update *);
+void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *);
+int bch2_data_update_init(struct btree_trans *, struct moving_context *,
+ struct data_update *,
+ struct write_point_specifier,
+ struct bch_io_opts, struct data_update_opts,
+ enum btree_id, struct bkey_s_c);
+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
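
A small illustrative sketch of the data_update_opts bitmask convention implied by bch2_data_update_init() and bch2_data_update_opts_normalize() above; the values are made up:

	/* Illustrative only: bit i of rewrite_ptrs/kill_ptrs refers to the
	 * i'th pointer of the extent handed to bch2_data_update_init();
	 * cached pointers flagged for rewrite are shifted from rewrite_ptrs
	 * to kill_ptrs by bch2_data_update_opts_normalize(). */
	struct data_update_opts opts = {
		.rewrite_ptrs		= 1U << 0,	/* rewrite the first replica */
		.btree_insert_flags	= BTREE_INSERT_NOFAIL,
	};
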
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index ee5b7f696796..d1563caf7fb7 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -11,6 +11,7 @@
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
+#include "btree_locking.h"
#include "btree_update.h"
#include "buckets.h"
#include "debug.h"
@@ -38,16 +39,16 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
struct bset *sorted, *inmemory = &b->data->keys;
struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
struct bio *bio;
- bool failed = false;
+ bool failed = false, saw_error = false;
if (!bch2_dev_get_ioref(ca, READ))
return false;
- bio = bio_alloc_bioset(GFP_NOIO,
- buf_pages(n_sorted, btree_bytes(c)),
- &c->btree_bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_READ|REQ_META;
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_sorted, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOIO,
+ &c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
bch2_bio_map(bio, n_sorted, btree_bytes(c));
@@ -59,7 +60,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
memcpy(n_ondisk, n_sorted, btree_bytes(c));
v->written = 0;
- if (bch2_btree_node_read_done(c, ca, v, false))
+ if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
return false;
n_sorted = c->verify_data->data;
@@ -152,7 +153,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
BUG_ON(b->nsets != 1);
- for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k))
+ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
if (k->type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
v->mem_ptr = 0;
@@ -169,38 +170,159 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
failed |= bch2_btree_verify_replica(c, b, p);
if (failed) {
- char buf[200];
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
- bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ printbuf_exit(&buf);
}
out:
mutex_unlock(&c->verify_lock);
bch2_btree_node_io_unlock(b);
}
+void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
+ const struct btree *b)
+{
+ struct btree_node *n_ondisk = NULL;
+ struct extent_ptr_decoded pick;
+ struct bch_dev *ca;
+ struct bio *bio = NULL;
+ unsigned offset = 0;
+ int ret;
+
+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+ prt_printf(out, "error getting device to read from: invalid device\n");
+ return;
+ }
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ if (!bch2_dev_get_ioref(ca, READ)) {
+ prt_printf(out, "error getting device to read from: not online\n");
+ return;
+ }
+
+ n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ if (!n_ondisk) {
+ prt_printf(out, "memory allocation failure\n");
+ goto out;
+ }
+
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_ondisk, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOIO,
+ &c->btree_bio);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+
+ ret = submit_bio_wait(bio);
+ if (ret) {
+ prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
+ goto out;
+ }
+
+ while (offset < btree_sectors(c)) {
+ struct bset *i;
+ struct nonce nonce;
+ struct bch_csum csum;
+ struct bkey_packed *k;
+ unsigned sectors;
+
+ if (!offset) {
+ i = &n_ondisk->keys;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
+
+ if (bch2_crc_cmp(csum, n_ondisk->csum)) {
+ prt_printf(out, "invalid checksum\n");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(n_ondisk, c->block_bits);
+ } else {
+ struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
+
+ i = &bne->keys;
+
+ if (i->seq != n_ondisk->keys.seq)
+ break;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+ if (bch2_crc_cmp(csum, bne->csum)) {
+ prt_printf(out, "invalid checksum");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(out, " offset %u version %u, journal seq %llu\n",
+ offset,
+ le16_to_cpu(i->version),
+ le64_to_cpu(i->journal_seq));
+ offset += sectors;
+
+ printbuf_indent_add(out, 4);
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
+ struct bkey u;
+
+ bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
+ prt_newline(out);
+ }
+
+ printbuf_indent_sub(out, 4);
+ }
+out:
+ if (bio)
+ bio_put(bio);
+ kvpfree(n_ondisk, btree_bytes(c));
+ percpu_ref_put(&ca->io_ref);
+}
+
#ifdef CONFIG_DEBUG_FS
/* XXX: bch_fs refcounting */
struct dump_iter {
- struct bpos from;
- struct bch_fs *c;
+ struct bch_fs *c;
enum btree_id id;
+ struct bpos from;
+ struct bpos prev_node;
+ u64 iter;
- char buf[1 << 12];
- size_t bytes; /* what's currently in buf */
+ struct printbuf buf;
char __user *ubuf; /* destination user buffer */
size_t size; /* size of requested read */
ssize_t ret; /* bytes read so far */
};
-static int flush_buf(struct dump_iter *i)
+static ssize_t flush_buf(struct dump_iter *i)
{
- if (i->bytes) {
- size_t bytes = min(i->bytes, i->size);
- int err = copy_to_user(i->ubuf, i->buf, bytes);
+ if (i->buf.pos) {
+ size_t bytes = min_t(size_t, i->buf.pos, i->size);
+ int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
if (err)
return err;
@@ -208,11 +330,11 @@ static int flush_buf(struct dump_iter *i)
i->ret += bytes;
i->ubuf += bytes;
i->size -= bytes;
- i->bytes -= bytes;
- memmove(i->buf, i->buf + bytes, i->bytes);
+ i->buf.pos -= bytes;
+ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
}
- return 0;
+ return i->size ? 0 : i->ret;
}
static int bch2_dump_open(struct inode *inode, struct file *file)
@@ -226,15 +348,20 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
file->private_data = i;
i->from = POS_MIN;
+ i->iter = 0;
i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
i->id = bd->id;
+ i->buf = PRINTBUF;
return 0;
}
static int bch2_dump_release(struct inode *inode, struct file *file)
{
- kfree(file->private_data);
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
return 0;
}
@@ -245,48 +372,33 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- int err;
+ ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- return i->ret;
-
bch2_trans_init(&trans, i->c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(&iter);
-
- while (k.k && !(err = bkey_err(k))) {
- bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
- i->bytes = strlen(i->buf);
- BUG_ON(i->bytes >= sizeof(i->buf));
- i->buf[i->bytes] = '\n';
- i->bytes++;
-
- k = bch2_btree_iter_next(&iter);
- i->from = iter.pos;
-
- err = flush_buf(i);
- if (err)
+ ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = flush_buf(i);
+ if (ret)
break;
- if (!i->size)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ 0;
+ }));
+ i->from = iter.pos;
+
+ if (!ret)
+ ret = flush_buf(i);
bch2_trans_exit(&trans);
- return err < 0 ? err : i->ret;
+ return ret ?: i->ret;
}
static const struct file_operations btree_debug_ops = {
@@ -303,44 +415,39 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
struct btree_trans trans;
struct btree_iter iter;
struct btree *b;
- int err;
+ ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
- if (!i->size || !bpos_cmp(SPOS_MAX, i->from))
+ if (bpos_eq(SPOS_MAX, i->from))
return i->ret;
bch2_trans_init(&trans, i->c, 0, 0);
- for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) {
- bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
- i->bytes = strlen(i->buf);
- err = flush_buf(i);
- if (err)
+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
+ ret = flush_buf(i);
+ if (ret)
break;
- /*
- * can't easily correctly restart a btree node traversal across
- * all nodes, meh
- */
- i->from = bpos_cmp(SPOS_MAX, b->key.k.p)
+ bch2_btree_node_to_text(&i->buf, i->c, b);
+ i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
? bpos_successor(b->key.k.p)
: b->key.k.p;
-
- if (!i->size)
- break;
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return err < 0 ? err : i->ret;
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
}
static const struct file_operations btree_format_debug_ops = {
@@ -357,75 +464,400 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct btree *prev_node = NULL;
- int err;
+ ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- err = flush_buf(i);
- if (err)
- return err;
-
- if (!i->size)
- return i->ret;
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
bch2_trans_init(&trans, i->c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(err = bkey_err(k))) {
+ ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
struct btree_path_level *l = &iter.path->l[0];
struct bkey_packed *_k =
bch2_btree_node_iter_peek(&l->iter, l->b);
- if (l->b != prev_node) {
- bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b);
- i->bytes = strlen(i->buf);
- err = flush_buf(i);
- if (err)
- break;
+ ret = flush_buf(i);
+ if (ret)
+ break;
+
+ if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
}
- prev_node = l->b;
- bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k);
- i->bytes = strlen(i->buf);
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ 0;
+ }));
+ i->from = iter.pos;
+
+ bch2_trans_exit(&trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_bfloat_failed,
+};
+
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "%px btree=%s l=%u ",
+ b,
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out, "flags: ");
+ prt_tab(out);
+ prt_bitflags(out, bch2_btree_node_flags, b->flags);
+ prt_newline(out);
+
+ prt_printf(out, "pcpu read locks: ");
+ prt_tab(out);
+ prt_printf(out, "%u", b->c.lock.readers != NULL);
+ prt_newline(out);
+
+ prt_printf(out, "written:");
+ prt_tab(out);
+ prt_printf(out, "%u", b->written);
+ prt_newline(out);
+
+ prt_printf(out, "writes blocked:");
+ prt_tab(out);
+ prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
+ prt_newline(out);
+
+ prt_printf(out, "will make reachable:");
+ prt_tab(out);
+ prt_printf(out, "%lx", b->will_make_reachable);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[0].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[0].journal.seq);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[1].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[1].journal.seq);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ rcu_read_lock();
+ i->buf.atomic++;
+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
+ --i->buf.atomic;
+ rcu_read_unlock();
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_cached_btree_nodes_read,
+};
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ mutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ ret = flush_buf(i);
+ if (ret)
+ break;
+
+ bch2_btree_trans_to_text(&i->buf, trans);
+
+ prt_printf(&i->buf, "backtrace:");
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+ }
+ mutex_unlock(&c->btree_trans_lock);
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_transactions_read,
+};
+#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
err = flush_buf(i);
if (err)
+ return err;
+
+ if (!i->size)
break;
- bch2_btree_iter_advance(&iter);
- i->from = iter.pos;
+ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+ i->iter++;
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_journal_pins_read,
+};
+
+static int lock_held_stats_open(struct inode *inode, struct file *file)
+{
+ struct bch_fs *c = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+
+ if (!i)
+ return -ENOMEM;
+
+ i->iter = 0;
+ i->c = c;
+ i->buf = PRINTBUF;
+ file->private_data = i;
+
+ return 0;
+}
+
+static int lock_held_stats_release(struct inode *inode, struct file *file)
+{
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
+
+ return 0;
+}
+
+static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ while (1) {
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
err = flush_buf(i);
if (err)
- break;
+ return err;
if (!i->size)
break;
+
+ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
+ !bch2_btree_transaction_fns[i->iter])
+ break;
+
+ prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+
+ mutex_lock(&s->lock);
+
+ prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
+ prt_newline(&i->buf);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+ prt_printf(&i->buf, "Lock hold times:");
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ if (s->max_paths_text) {
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ prt_str_indented(&i->buf, s->max_paths_text);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ mutex_unlock(&s->lock);
+
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+ i->iter++;
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
- return err < 0 ? err : i->ret;
+ return i->ret;
}
-static const struct file_operations bfloat_failed_debug_ops = {
+static const struct file_operations lock_held_stats_op = {
+ .owner = THIS_MODULE,
+ .open = lock_held_stats_open,
+ .release = lock_held_stats_release,
+ .read = lock_held_stats_read,
+};
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (i->iter)
+ goto out;
+
+ mutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ ret = flush_buf(i);
+ if (ret)
+ break;
+
+ bch2_check_for_deadlock(trans, &i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+ }
+ mutex_unlock(&c->btree_trans_lock);
+out:
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_deadlock_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
- .read = bch2_read_bfloat_failed,
+ .read = bch2_btree_deadlock_read,
};
void bch2_fs_debug_exit(struct bch_fs *c)
{
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove_recursive(c->debug);
+ if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+ debugfs_remove_recursive(c->fs_debug_dir);
}
void bch2_fs_debug_init(struct bch_fs *c)
@@ -437,29 +869,50 @@ void bch2_fs_debug_init(struct bch_fs *c)
return;
snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
- c->debug = debugfs_create_dir(name, bch_debug);
- if (IS_ERR_OR_NULL(c->debug))
+ c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->fs_debug_dir))
+ return;
+
+ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+ c->btree_debug, &cached_btree_nodes_ops);
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_transactions_ops);
+#endif
+
+ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+ c->btree_debug, &journal_pins_ops);
+
+ debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
+ c, &lock_held_stats_op);
+
+ debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_deadlock_ops);
+
+ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+ if (IS_ERR_OR_NULL(c->btree_debug_dir))
return;
for (bd = c->btree_debug;
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
- 0400, c->debug, bd,
- &btree_debug_ops);
+ debugfs_create_file(bch2_btree_ids[bd->id],
+ 0400, c->btree_debug_dir, bd,
+ &btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
bch2_btree_ids[bd->id]);
- bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
- &btree_format_debug_ops);
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
bch2_btree_ids[bd->id]);
- bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
- &bfloat_failed_debug_ops);
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &bfloat_failed_debug_ops);
}
}
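
The hunks above expose the new dump files under each filesystem's debugfs directory, which is named after its user UUID. A minimal userspace reader is sketched below; it assumes debugfs is mounted at /sys/kernel/debug and that the top-level bcachefs debugfs directory is named "bcachefs" (neither is established by this patch). Each read(2) re-enters the corresponding *_read() handler above, which refills the printbuf and flushes it to the user buffer:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char path[512], buf[4096];
        ssize_t n;
        int fd;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <fs-uuid> <file>\n", argv[0]);
                return 1;
        }

        /* <file>: cached_btree_nodes, btree_transactions, journal_pins,
         * btree_transaction_stats or btree_deadlock */
        snprintf(path, sizeof(path), "/sys/kernel/debug/bcachefs/%s/%s",
                 argv[1], argv[2]);

        fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror(path);
                return 1;
        }

        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);

        close(fd);
        return 0;
}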
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
index 0b86736e5e1b..2c37143b5fd1 100644
--- a/fs/bcachefs/debug.h
+++ b/fs/bcachefs/debug.h
@@ -9,6 +9,8 @@ struct btree;
struct bch_fs;
void __bch2_btree_verify(struct bch_fs *, struct btree *);
+void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
+ const struct btree *);
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 6f699b736b34..4c85d3399fb4 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -83,38 +83,58 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.is_visible = dirent_is_visible,
};
-const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
unsigned len;
- if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
- return "value too small";
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*d.v));
+ return -BCH_ERR_invalid_bkey;
+ }
len = bch2_dirent_name_bytes(d);
- if (!len)
- return "empty name";
+ if (!len) {
+ prt_printf(err, "empty name");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
- return "value too big";
+ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) {
+ prt_printf(err, "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), dirent_val_u64s(len));
+ return -BCH_ERR_invalid_bkey;
+ }
- if (len > BCH_NAME_MAX)
- return "dirent name too big";
+ if (len > BCH_NAME_MAX) {
+ prt_printf(err, "dirent name too big (%u > %u)",
+ len, BCH_NAME_MAX);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (len == 1 && !memcmp(d.v->d_name, ".", 1))
- return "invalid name";
+ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) {
+ prt_printf(err, "invalid name");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (len == 2 && !memcmp(d.v->d_name, "..", 2))
- return "invalid name";
+ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) {
+ prt_printf(err, "invalid name");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (memchr(d.v->d_name, '/', len))
- return "invalid name";
+ if (memchr(d.v->d_name, '/', len)) {
+ prt_printf(err, "invalid name");
+ return -BCH_ERR_invalid_bkey;
+ }
if (d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode)
- return "dirent points to own directory";
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
+ prt_printf(err, "dirent points to own directory");
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
@@ -122,9 +142,9 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- bch_scnmemcpy(out, d.v->d_name,
- bch2_dirent_name_bytes(d));
- pr_buf(out, " -> %llu type %s",
+ prt_printf(out, "%.*s -> %llu type %s",
+ bch2_dirent_name_bytes(d),
+ d.v->d_name,
d.v->d_type != DT_SUBVOL
? le64_to_cpu(d.v->d_inum)
: le32_to_cpu(d.v->d_child_subvol),
@@ -330,8 +350,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
bkey_init(&new_src->k);
new_src->k.p = src_iter.pos;
- if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
- bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+ if (bkey_le(dst_pos, src_iter.pos) &&
+ bkey_lt(src_iter.pos, dst_iter.pos)) {
/*
* We have a hash collision for the new dst key,
* and new_src - the key we're deleting - is between
@@ -451,7 +471,7 @@ retry:
ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
name, inum, 0);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!ret)
bch2_trans_iter_exit(&trans, &iter);
@@ -470,16 +490,13 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
if (ret)
return ret;
- for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(dir.inum, 0, snapshot), 0, k, ret) {
- if (k.k->p.inode > dir.inum)
- break;
-
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir.inum, 0, snapshot),
+ POS(dir.inum, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
ret = -ENOTEMPTY;
break;
}
- }
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -503,11 +520,9 @@ retry:
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
- SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
- if (k.k->p.inode > inum.inum)
- break;
-
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+ SPOS(inum.inum, ctx->pos, snapshot),
+ POS(inum.inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;
@@ -541,7 +556,7 @@ retry:
}
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 1bb4d802bc1d..ad131e8edc29 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -6,13 +6,13 @@
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_ops_dirent (struct bkey_ops) { \
+#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
.key_invalid = bch2_dirent_invalid, \
.val_to_text = bch2_dirent_to_text, \
-}
+})
struct qstr;
struct file;
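
The dirent hunks follow the merge-wide change to the ->key_invalid convention: validators now return a negative BCH_ERR_* code and write a human-readable reason into a caller-supplied printbuf instead of returning a string. The fragment below is a hypothetical call site, not taken from the patch (flags passed as 0 for brevity), showing the caller-side shape of the new convention:

        struct printbuf err = PRINTBUF;
        int ret = bch2_dirent_invalid(c, k, 0, &err);

        if (ret)
                bch_err(c, "invalid dirent (%s): %s",
                        bch2_err_str(ret), err.buf);
        printbuf_exit(&err);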
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 6c84297ef265..1a8f8b3750da 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -27,7 +27,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
unsigned nr_groups = disk_groups_nr(groups);
unsigned i, len;
- int ret = -EINVAL;
+ int ret = 0;
for (i = 0; i < sb->nr_devices; i++) {
struct bch_member *m = mi->members + i;
@@ -39,14 +39,14 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
g = BCH_MEMBER_GROUP(m) - 1;
if (g >= nr_groups) {
- pr_buf(err, "disk %u has invalid label %u (have %u)",
+ prt_printf(err, "disk %u has invalid label %u (have %u)",
i, g, nr_groups);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_disk_groups;
}
if (BCH_GROUP_DELETED(&groups->entries[g])) {
- pr_buf(err, "disk %u has deleted label %u", i, g);
- return -EINVAL;
+ prt_printf(err, "disk %u has deleted label %u", i, g);
+ return -BCH_ERR_invalid_sb_disk_groups;
}
}
@@ -61,14 +61,14 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
len = strnlen(g->label, sizeof(g->label));
if (!len) {
- pr_buf(err, "label %u empty", i);
- return -EINVAL;
+ prt_printf(err, "label %u empty", i);
+ return -BCH_ERR_invalid_sb_disk_groups;
}
}
sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
if (!sorted)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_disk_groups_validate;
memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
@@ -76,15 +76,15 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
for (g = sorted; g + 1 < sorted + nr_groups; g++)
if (!BCH_GROUP_DELETED(g) &&
!group_cmp(&g[0], &g[1])) {
- pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g));
- bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label)));
+ prt_printf(err, "duplicate label %llu.%.*s",
+ BCH_GROUP_PARENT(g),
+ (int) sizeof(g->label), g->label);
+ ret = -BCH_ERR_invalid_sb_disk_groups;
goto err;
}
-
- ret = 0;
err:
kfree(sorted);
- return 0;
+ return ret;
}
static void bch2_sb_disk_groups_to_text(struct printbuf *out,
@@ -100,12 +100,12 @@ static void bch2_sb_disk_groups_to_text(struct printbuf *out,
g < groups->entries + nr_groups;
g++) {
if (g != groups->entries)
- pr_buf(out, " ");
+ prt_printf(out, " ");
if (BCH_GROUP_DELETED(g))
- pr_buf(out, "[deleted]");
+ prt_printf(out, "[deleted]");
else
- pr_buf(out, "[parent %llu name %s]",
+ prt_printf(out, "[parent %llu name %s]",
BCH_GROUP_PARENT(g), g->label);
}
}
@@ -134,7 +134,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
cpu_g = kzalloc(sizeof(*cpu_g) +
sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
if (!cpu_g)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
cpu_g->nr = nr_groups;
@@ -275,7 +275,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
groups = bch2_sb_resize_disk_groups(sb, u64s);
if (!groups)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_disk_label_add;
nr_groups = disk_groups_nr(groups);
}
@@ -342,12 +342,10 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out,
- struct bch_sb_handle *sb,
- unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_get_disk_groups(sb);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
@@ -376,43 +374,43 @@ void bch2_disk_path_to_text(struct printbuf *out,
v = path[--nr];
g = groups->entries + v;
- bch_scnmemcpy(out, g->label,
- strnlen(g->label, sizeof(g->label)));
-
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
if (nr)
- pr_buf(out, ".");
+ prt_printf(out, ".");
}
return;
inval:
- pr_buf(out, "invalid group %u", v);
+ prt_printf(out, "invalid label %u", v);
}
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
- int v = -1;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
+ int ret, v = -1;
if (!strlen(name) || !strcmp(name, "none"))
- goto write_sb;
+ return 0;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0) {
- mutex_unlock(&c->sb_lock);
+ if (v < 0)
return v;
- }
ret = bch2_sb_disk_groups_to_cpu(c);
if (ret)
- goto unlock;
-write_sb:
+ return ret;
+
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
+ return 0;
+}
- bch2_write_super(c);
-unlock:
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = __bch2_dev_group_set(c, ca, name) ?:
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return ret;
@@ -448,41 +446,57 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
return -EINVAL;
}
-void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v)
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
{
struct target t = target_decode(v);
switch (t.type) {
case TARGET_NULL:
- pr_buf(out, "none");
+ prt_printf(out, "none");
break;
- case TARGET_DEV: {
- struct bch_dev *ca;
-
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- char b[BDEVNAME_SIZE];
-
- pr_buf(out, "/dev/%s",
- bdevname(ca->disk_sb.bdev, b));
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- pr_buf(out, "offline device %u", t.dev);
+ case TARGET_DEV:
+ if (c) {
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ prt_printf(out, "offline device %u", t.dev);
+ } else {
+ prt_printf(out, "invalid device %u", t.dev);
+ }
+
+ rcu_read_unlock();
} else {
- pr_buf(out, "invalid device %u", t.dev);
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ struct bch_member *m = mi->members + t.dev;
+
+ if (bch2_dev_exists(sb, mi, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m->uuid.b);
+ prt_printf(out, " (%u)", t.dev);
+ } else {
+ prt_printf(out, "Bad device %u", t.dev);
+ }
}
-
- rcu_read_unlock();
break;
- }
case TARGET_GROUP:
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, &c->disk_sb, t.group);
- mutex_unlock(&c->sb_lock);
+ if (c) {
+ mutex_lock(&c->sb_lock);
+ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
+ mutex_unlock(&c->sb_lock);
+ } else {
+ bch2_disk_path_to_text(out, sb, t.group);
+ }
break;
default:
BUG();
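
bch2_opt_target_to_text() now takes both a bch_fs and a bch_sb, so a target can be rendered even when no filesystem is running (superblock-only contexts such as userspace tooling). The two calls below are purely illustrative, not call sites from this patch: with a live filesystem the superblock is reached through c->disk_sb.sb, while the sb-only form passes c == NULL and relies on the member and label information in the superblock itself:

        bch2_opt_target_to_text(&out, c, c->disk_sb.sb, v);     /* online */
        bch2_opt_target_to_text(&out, NULL, sb, v);             /* superblock only */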
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
index 3d84f23c34ed..e4470c357a66 100644
--- a/fs/bcachefs/disk_groups.h
+++ b/fs/bcachefs/disk_groups.h
@@ -75,14 +75,14 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
- unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
-void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 220ced2f9a17..1855d08efd4b 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -4,10 +4,12 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "disk_groups.h"
#include "ec.h"
@@ -102,44 +104,62 @@ struct ec_bio {
/* Stripes btree keys: */
-const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- if (!bkey_cmp(k.k->p, POS_MIN))
- return "stripe at pos 0";
+ if (bkey_eq(k.k->p, POS_MIN)) {
+ prt_printf(err, "stripe at POS_MIN");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (k.k->p.inode)
- return "invalid stripe key";
+ if (k.k->p.inode) {
+ prt_printf(err, "nonzero inode field");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_bytes(k.k) < sizeof(*s))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) < sizeof(*s)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*s));
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_bytes(k.k) < sizeof(*s) ||
- bkey_val_u64s(k.k) < stripe_val_u64s(s))
- return "incorrect value size";
+ if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
+ prt_printf(err, "incorrect value size (%zu < %u)",
+ bkey_val_u64s(k.k), stripe_val_u64s(s));
+ return -BCH_ERR_invalid_bkey;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, flags, err);
}
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned i;
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
- pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
s->algorithm,
le16_to_cpu(s->sectors),
- s->nr_blocks - s->nr_redundant,
+ nr_data,
s->nr_redundant,
s->csum_type,
1U << s->csum_granularity_bits);
- for (i = 0; i < s->nr_blocks; i++)
- pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
- (u64) s->ptrs[i].offset,
- stripe_blockcount_get(s, i));
+ for (i = 0; i < s->nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = s->ptrs + i;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
+ if (i < nr_data)
+ prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+ if (ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
}
/* returns blocknr in stripe that we matched: */
@@ -193,8 +213,9 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
}
}
+/* XXX: this is a non-mempoolified memory allocation: */
static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
- unsigned offset, unsigned size)
+ unsigned offset, unsigned size)
{
struct bch_stripe *v = &buf->key.v;
unsigned csum_granularity = 1U << v->csum_granularity_bits;
@@ -221,7 +242,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
return 0;
err:
ec_stripe_buf_exit(buf);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_stripe_buf;
}
/* Checksumming: */
@@ -286,14 +307,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- char buf2[200];
+ struct printbuf buf2 = PRINTBUF;
- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i));
bch_err_ratelimited(c,
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
- want.lo, got.lo, buf2);
+ want.lo, got.lo, buf2.buf);
+ printbuf_exit(&buf2);
clear_bit(i, buf->valid);
break;
}
@@ -395,13 +417,16 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
while (offset < bytes) {
- unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
DIV_ROUND_UP(bytes, PAGE_SIZE));
unsigned b = min_t(size_t, bytes - offset,
nr_iovecs << PAGE_SHIFT);
struct ec_bio *ec_bio;
- ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
+ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+ nr_iovecs,
+ rw,
+ GFP_KERNEL,
&c->ec_bioset),
struct ec_bio, bio);
@@ -409,9 +434,6 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->buf = buf;
ec_bio->idx = idx;
- bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
- bio_set_op_attrs(&ec_bio->bio, rw, 0);
-
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
ec_bio->bio.bi_end_io = ec_block_endio;
ec_bio->bio.bi_private = cl;
@@ -429,15 +451,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
percpu_ref_put(&ca->io_ref);
}
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
+ struct ec_stripe_buf *stripe)
{
- struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
POS(0, idx), BTREE_ITER_SLOTS);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
@@ -449,11 +470,15 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
}
bkey_reassemble(&stripe->key.k_i, k);
err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+{
+ return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe));
+}
+
/* recovery read path: */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
{
@@ -469,7 +494,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
buf = kzalloc(sizeof(*buf), GFP_NOIO);
if (!buf)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_ec_read_extent;
ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
if (ret) {
@@ -534,25 +559,25 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
if (idx >= h->size) {
if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
- spin_lock(&c->ec_stripes_heap_lock);
+ mutex_lock(&c->ec_stripes_heap_lock);
if (n.size > h->size) {
memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
n.used = h->used;
swap(*h, n);
}
- spin_unlock(&c->ec_stripes_heap_lock);
+ mutex_unlock(&c->ec_stripes_heap_lock);
free_heap(&n);
}
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
return 0;
}
@@ -561,26 +586,88 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
struct btree_iter *iter)
{
size_t idx = iter->pos.offset;
- int ret = 0;
if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN))
- return ret;
+ return 0;
bch2_trans_unlock(trans);
- ret = -EINTR;
- if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL))
- return ret;
+ return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?:
+ bch2_trans_relock(trans);
+}
+
+/*
+ * Hash table of open stripes:
+ * Stripes that are being created or modified are kept in a hash table, so that
+ * stripe deletion can skip them.
+ */
+
+static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+ struct ec_stripe_new *s;
- return -ENOMEM;
+ hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
+ if (s->idx == idx)
+ return true;
+ return false;
}
-static ssize_t stripe_idx_to_delete(struct bch_fs *c)
+static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ bool ret = false;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = __bch2_stripe_is_open(c, idx);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static bool bch2_try_open_stripe(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ u64 idx)
+{
+ bool ret;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = !__bch2_stripe_is_open(c, idx);
+ if (ret) {
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+
+ s->idx = idx;
+ hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
+ }
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ BUG_ON(!s->idx);
+
+ spin_lock(&c->ec_stripes_new_lock);
+ hlist_del_init(&s->hash);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ s->idx = 0;
+}
+
+/* Heap of all existing stripes, ordered by blocks_nonempty */
+
+static u64 stripe_idx_to_delete(struct bch_fs *c)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
- return h->used && h->data[0].blocks_nonempty == 0
- ? h->data[0].idx : -1;
+ lockdep_assert_held(&c->ec_stripes_heap_lock);
+
+ if (h->used &&
+ h->data[0].blocks_nonempty == 0 &&
+ !bch2_stripe_is_open(c, h->data[0].idx))
+ return h->data[0].idx;
+
+ return 0;
}
static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
@@ -604,7 +691,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m = genradix_ptr(&c->stripes, idx);
- BUG_ON(!m->alive);
BUG_ON(m->heap_idx >= h->used);
BUG_ON(h->data[m->heap_idx].idx != idx);
}
@@ -612,28 +698,21 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
- if (!m->on_heap)
- return;
-
- m->on_heap = false;
-
+ mutex_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
heap_del(&c->ec_stripes_heap, m->heap_idx,
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
+ mutex_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
- if (m->on_heap)
- return;
-
+ mutex_lock(&c->ec_stripes_heap_lock);
BUG_ON(heap_full(&c->ec_stripes_heap));
- m->on_heap = true;
-
heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
.idx = idx,
.blocks_nonempty = m->blocks_nonempty,
@@ -642,17 +721,17 @@ void bch2_stripes_heap_insert(struct bch_fs *c,
ec_stripes_heap_set_backpointer);
heap_verify_backpointer(c, idx);
+ mutex_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_update(struct bch_fs *c,
struct stripe *m, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
+ bool do_deletes;
size_t i;
- if (!m->on_heap)
- return;
-
+ mutex_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
@@ -665,228 +744,326 @@ void bch2_stripes_heap_update(struct bch_fs *c,
heap_verify_backpointer(c, idx);
- if (stripe_idx_to_delete(c) >= 0 &&
- !percpu_ref_is_dying(&c->writes))
- schedule_work(&c->ec_stripe_delete_work);
+ do_deletes = stripe_idx_to_delete(c) != 0;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ if (do_deletes)
+ bch2_do_stripe_deletes(c);
}
/* stripe deletion */
-static int ec_stripe_delete(struct bch_fs *c, size_t idx)
+static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
- return bch2_btree_delete_range(c, BTREE_ID_stripes,
- POS(0, idx),
- POS(0, idx + 1),
- 0, NULL);
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_stripe s;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_stripe) {
+ bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ s = bkey_s_c_to_stripe(k);
+ for (unsigned i = 0; i < s.v->nr_blocks; i++)
+ if (stripe_blockcount_get(s.v, i)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- ssize_t idx;
+ struct btree_trans trans;
+ int ret;
+ u64 idx;
+
+ bch2_trans_init(&trans, c, 0, 0);
while (1) {
- spin_lock(&c->ec_stripes_heap_lock);
+ mutex_lock(&c->ec_stripes_heap_lock);
idx = stripe_idx_to_delete(c);
- if (idx < 0) {
- spin_unlock(&c->ec_stripes_heap_lock);
- break;
- }
+ mutex_unlock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx);
- spin_unlock(&c->ec_stripes_heap_lock);
+ if (!idx)
+ break;
- if (ec_stripe_delete(c, idx))
+ ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ec_stripe_delete(&trans, idx));
+ if (ret) {
+ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
break;
+ }
}
+
+ bch2_trans_exit(&trans);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+void bch2_do_stripe_deletes(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+ !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
/* stripe creation: */
-static int ec_stripe_bkey_insert(struct btree_trans *trans,
- struct bkey_i_stripe *stripe,
- struct disk_reservation *res)
+static int ec_stripe_key_update(struct btree_trans *trans,
+ struct bkey_i_stripe *new,
+ bool create)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- struct bpos min_pos = POS(0, 1);
- struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
- if (start_pos.offset) {
- start_pos = min_pos;
- bch2_btree_iter_set_pos(&iter, start_pos);
- continue;
- }
-
- ret = -ENOSPC;
- break;
- }
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- if (bkey_deleted(k.k))
- goto found_slot;
+ if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
+ bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
+ create ? "creating" : "updating",
+ bch2_bkey_types[k.k->type]);
+ ret = -EINVAL;
+ goto err;
}
- goto err;
-found_slot:
- start_pos = iter.pos;
+ if (k.k->type == KEY_TYPE_stripe) {
+ const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
+ unsigned i;
- ret = ec_stripe_mem_alloc(trans, &iter);
- if (ret)
- goto err;
+ if (old->nr_blocks != new->v.nr_blocks) {
+ bch_err(c, "error updating stripe: nr_blocks does not match");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (i = 0; i < new->v.nr_blocks; i++) {
+ unsigned v = stripe_blockcount_get(old, i);
- stripe->k.p = iter.pos;
+ BUG_ON(v &&
+ (old->ptrs[i].dev != new->v.ptrs[i].dev ||
+ old->ptrs[i].gen != new->v.ptrs[i].gen ||
+ old->ptrs[i].offset != new->v.ptrs[i].offset));
- ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0);
+ stripe_blockcount_set(&new->v, i, v);
+ }
+ }
- c->ec_stripe_hint = start_pos.offset;
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
bch2_trans_iter_exit(trans, &iter);
-
return ret;
}
-static int ec_stripe_bkey_update(struct btree_trans *trans,
- struct bkey_i_stripe *new,
- struct disk_reservation *res)
+static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct bpos bucket, u8 gen,
+ struct ec_stripe_buf *s,
+ struct bpos *bp_pos)
{
+ struct bch_fs *c = trans->c;
+ struct bch_backpointer bp;
struct btree_iter iter;
struct bkey_s_c k;
- const struct bch_stripe *existing;
- unsigned i;
- int ret;
+ const struct bch_extent_ptr *ptr_c;
+ struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+ struct bch_extent_stripe_ptr stripe_ptr;
+ struct bkey_i *n;
+ int ret, dev, block;
+
+ ret = bch2_get_next_backpointer(trans, bucket, gen,
+ bp_pos, &bp, BTREE_ITER_CACHED);
+ if (ret)
+ return ret;
+ if (bpos_eq(*bp_pos, SPOS_MAX))
+ return 0;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&iter);
+ if (bp.level) {
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter node_iter;
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
+ bch2_trans_iter_exit(trans, &node_iter);
+
+ if (!b)
+ return 0;
+
+ prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
+ bch2_backpointer_to_text(&buf, &bp);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EIO;
+ }
+
+ k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
ret = bkey_err(k);
if (ret)
- goto err;
-
- if (!k.k || k.k->type != KEY_TYPE_stripe) {
- bch_err(trans->c, "error updating stripe: not found");
- ret = -ENOENT;
- goto err;
+ return ret;
+ if (!k.k) {
+ /*
+ * extent no longer exists - we could flush the btree
+ * write buffer and retry to verify, but no need:
+ */
+ return 0;
}
- existing = bkey_s_c_to_stripe(k).v;
+ if (extent_has_stripe_ptr(k, s->key.k.p.offset))
+ goto out;
- if (existing->nr_blocks != new->v.nr_blocks) {
- bch_err(trans->c, "error updating stripe: nr_blocks does not match");
- ret = -EINVAL;
- goto err;
- }
+ ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
+ /*
+ * It doesn't generally make sense to erasure code cached ptrs:
+ * XXX: should we be incrementing a counter?
+ */
+ if (!ptr_c || ptr_c->cached)
+ goto out;
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i,
- stripe_blockcount_get(existing, i));
+ dev = s->key.v.ptrs[block].dev;
- ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto out;
-static void extent_stripe_ptr_add(struct bkey_s_extent e,
- struct ec_stripe_buf *s,
- struct bch_extent_ptr *ptr,
- unsigned block)
-{
- struct bch_extent_stripe_ptr *dst = (void *) ptr;
- union bch_extent_entry *end = extent_entry_last(e);
+ bkey_reassemble(n, k);
- memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
- e.k->u64s += sizeof(*dst) / sizeof(u64);
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
+ BUG_ON(!ec_ptr);
- *dst = (struct bch_extent_stripe_ptr) {
+ stripe_ptr = (struct bch_extent_stripe_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
.block = block,
.redundancy = s->key.v.nr_redundant,
.idx = s->key.k.p.offset,
};
-}
-
-static int ec_stripe_update_ptrs(struct bch_fs *c,
- struct ec_stripe_buf *s,
- struct bkey *pos)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_extent e;
- struct bkey_buf sk;
- struct bpos next_pos;
- int ret = 0, dev, block;
- bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+ __extent_entry_insert(n,
+ (union bch_extent_entry *) ec_ptr,
+ (union bch_extent_entry *) &stripe_ptr);
- /* XXX this doesn't support the reflink btree */
+ ret = bch2_trans_update(trans, &iter, n, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- bkey_start_pos(pos),
- BTREE_ITER_INTENT);
-retry:
- while (bch2_trans_begin(&trans),
- (k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
- const struct bch_extent_ptr *ptr_c;
- struct bch_extent_ptr *ptr, *ec_ptr = NULL;
-
- if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
- bch2_btree_iter_advance(&iter);
- continue;
- }
+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
+ unsigned block)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_extent_ptr bucket = s->key.v.ptrs[block];
+ struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+ struct bpos bp_pos = POS_MIN;
+ int ret = 0;
- ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
- /*
- * It doesn't generally make sense to erasure code cached ptrs:
- * XXX: should we be incrementing a counter?
- */
- if (!ptr_c || ptr_c->cached) {
- bch2_btree_iter_advance(&iter);
- continue;
- }
+ while (1) {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL,
+ ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
+ s, &bp_pos));
+ if (ret)
+ break;
+ if (bkey_eq(bp_pos, POS_MAX))
+ break;
- dev = s->key.v.ptrs[block].dev;
+ bp_pos = bpos_nosnap_successor(bp_pos);
+ }
- bch2_bkey_buf_reassemble(&sk, c, k);
- e = bkey_i_to_s_extent(sk.k);
+ return ret;
+}
- bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
- ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
- BUG_ON(!ec_ptr);
+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
+{
+ struct btree_trans trans;
+ struct bch_stripe *v = &s->key.v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret = 0;
- extent_stripe_ptr_add(e, s, ec_ptr, block);
+ bch2_trans_init(&trans, c, 0, 0);
- bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
- next_pos = sk.k->k.p;
+ ret = bch2_btree_write_buffer_flush(&trans);
+ if (ret)
+ goto err;
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, sk.k, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
- if (!ret)
- bch2_btree_iter_set_pos(&iter, next_pos);
+ for (i = 0; i < nr_data; i++) {
+ ret = ec_stripe_update_bucket(&trans, s, i);
if (ret)
break;
}
- if (ret == -EINTR)
- goto retry;
- bch2_trans_iter_exit(&trans, &iter);
-
+err:
bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
return ret;
}
+static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ unsigned block,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
+ int ret;
+
+ if (!bch2_dev_get_ioref(ca, WRITE)) {
+ s->err = -EROFS;
+ return;
+ }
+
+ memset(s->new_stripe.data[block] + (offset << 9),
+ 0,
+ ob->sectors_free << 9);
+
+ ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+ ob->bucket * ca->mi.bucket_size + offset,
+ ob->sectors_free,
+ GFP_KERNEL, 0);
+
+ percpu_ref_put(&ca->io_ref);
+
+ if (ret)
+ s->err = ret;
+}
+
+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ if (s->idx)
+ bch2_stripe_close(c, s);
+ kfree(s);
+}
+
/*
* data buckets of new stripe all written: create the stripe
*/
@@ -894,8 +1071,6 @@ static void ec_stripe_create(struct ec_stripe_new *s)
{
struct bch_fs *c = s->c;
struct open_bucket *ob;
- struct bkey_i *k;
- struct stripe *m;
struct bch_stripe *v = &s->new_stripe.key.v;
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret;
@@ -904,8 +1079,18 @@ static void ec_stripe_create(struct ec_stripe_new *s)
closure_sync(&s->iodone);
+ if (!s->err) {
+ for (i = 0; i < nr_data; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (ob->sectors_free)
+ zero_out_rest_of_ec_bucket(c, s, i, ob);
+ }
+ }
+
if (s->err) {
- if (s->err != -EROFS)
+ if (!bch2_err_matches(s->err, EROFS))
bch_err(c, "error creating stripe: error writing data buckets");
goto err;
}
@@ -927,9 +1112,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
BUG_ON(!s->allocated);
-
- if (!percpu_ref_tryget(&c->writes))
- goto err;
+ BUG_ON(!s->idx);
ec_generate_ec(&s->new_stripe);
@@ -942,34 +1125,25 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
- goto err_put_writes;
+ goto err;
}
- ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
- s->have_existing_stripe
- ? ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res)
- : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res));
+ ret = bch2_trans_do(c, &s->res, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL,
+ ec_stripe_key_update(&trans, &s->new_stripe.key,
+ !s->have_existing_stripe));
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
- goto err_put_writes;
+ goto err;
}
- for_each_keylist_key(&s->keys, k) {
- ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
- if (ret) {
- bch_err(c, "error creating stripe: error %i updating pointers", ret);
- break;
- }
+ ret = ec_stripe_update_extents(c, &s->new_stripe);
+ if (ret) {
+ bch_err(c, "error creating stripe: error updating pointers: %s",
+ bch2_err_str(ret));
+ goto err;
}
-
- spin_lock(&c->ec_stripes_heap_lock);
- m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset);
-
- BUG_ON(m->on_heap);
- bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
- spin_unlock(&c->ec_stripes_heap_lock);
-err_put_writes:
- percpu_ref_put(&c->writes);
err:
bch2_disk_reservation_put(c, &s->res);
@@ -985,39 +1159,50 @@ err:
}
}
- bch2_keylist_free(&s->keys, s->inline_keys);
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_del(&s->list);
+ mutex_unlock(&c->ec_stripe_new_lock);
ec_stripe_buf_exit(&s->existing_stripe);
ec_stripe_buf_exit(&s->new_stripe);
closure_debug_destroy(&s->iodone);
- kfree(s);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_stripe);
+}
+
+static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
+{
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ if (!atomic_read(&s->ref[STRIPE_REF_io]))
+ goto out;
+ s = NULL;
+out:
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return s;
}
static void ec_stripe_create_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work,
struct bch_fs, ec_stripe_create_work);
- struct ec_stripe_new *s, *n;
-restart:
- mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list)
- if (!atomic_read(&s->pin)) {
- list_del(&s->list);
- mutex_unlock(&c->ec_stripe_new_lock);
- ec_stripe_create(s);
- goto restart;
- }
- mutex_unlock(&c->ec_stripe_new_lock);
+ struct ec_stripe_new *s;
+
+ while ((s = get_pending_stripe(c)))
+ ec_stripe_create(s);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}
-static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s)
+void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
- BUG_ON(atomic_read(&s->pin) <= 0);
+ bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
- if (atomic_dec_and_test(&s->pin)) {
- BUG_ON(!s->pending);
- queue_work(system_long_wq, &c->ec_stripe_create_work);
- }
+ if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}
static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
@@ -1033,18 +1218,7 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
list_add(&s->list, &c->ec_stripe_new_list);
mutex_unlock(&c->ec_stripe_new_lock);
- ec_stripe_new_put(c, s);
-}
-
-/* have a full bucket - hand it off to be erasure coded: */
-void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
-{
- struct ec_stripe_new *s = ob->ec;
-
- if (ob->sectors_free)
- s->err = -1;
-
- ec_stripe_new_put(c, s);
+ ec_stripe_new_put(c, s, STRIPE_REF_io);
}
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
@@ -1063,36 +1237,14 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
if (!ob)
return NULL;
+ BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
+
ca = bch_dev_bkey_exists(c, ob->dev);
offset = ca->mi.bucket_size - ob->sectors_free;
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
-void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob,
- struct bkey *k)
-{
- struct ec_stripe_new *ec = ob->ec;
-
- if (!ec)
- return;
-
- mutex_lock(&ec->lock);
-
- if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
- ARRAY_SIZE(ec->inline_keys),
- BKEY_U64s)) {
- BUG();
- }
-
- bkey_init(&ec->keys.top->k);
- ec->keys.top->k.p = k->p;
- ec->keys.top->k.size = k->size;
- bch2_keylist_push(&ec->keys);
-
- mutex_unlock(&ec->lock);
-}
-
static int unsigned_cmp(const void *_l, const void *_r)
{
unsigned l = *((const unsigned *) _l);
@@ -1174,19 +1326,18 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
mutex_init(&s->lock);
closure_init(&s->iodone, NULL);
- atomic_set(&s->pin, 1);
+ atomic_set(&s->ref[STRIPE_REF_stripe], 1);
+ atomic_set(&s->ref[STRIPE_REF_io], 1);
s->c = c;
s->h = h;
s->nr_data = min_t(unsigned, h->nr_active_devs,
BCH_BKEY_PTRS_MAX) - h->redundancy;
s->nr_parity = h->redundancy;
- bch2_keylist_init(&s->keys, s->inline_keys);
-
ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
s->nr_parity, h->blocksize);
@@ -1197,7 +1348,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
unsigned algo, unsigned redundancy,
- bool copygc)
+ enum alloc_reserve reserve)
{
struct ec_stripe_head *h;
struct bch_dev *ca;
@@ -1208,12 +1359,12 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
return NULL;
mutex_init(&h->lock);
- mutex_lock(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
h->target = target;
h->algo = algo;
h->redundancy = redundancy;
- h->copygc = copygc;
+ h->reserve = reserve;
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, target);
@@ -1244,36 +1395,49 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
mutex_unlock(&h->lock);
}
-struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
+struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans,
unsigned target,
unsigned algo,
unsigned redundancy,
- bool copygc)
+ enum alloc_reserve reserve)
{
+ struct bch_fs *c = trans->c;
struct ec_stripe_head *h;
+ int ret;
if (!redundancy)
return NULL;
- mutex_lock(&c->ec_stripe_head_lock);
+ ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+ h = ERR_PTR(-EROFS);
+ goto found;
+ }
+
list_for_each_entry(h, &c->ec_stripe_head_list, list)
if (h->target == target &&
h->algo == algo &&
h->redundancy == redundancy &&
- h->copygc == copygc) {
- mutex_lock(&h->lock);
+ h->reserve == reserve) {
+ ret = bch2_trans_mutex_lock(trans, &h->lock);
+ if (ret)
+ h = ERR_PTR(ret);
goto found;
}
- h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc);
+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, reserve);
found:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
-static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
- struct closure *cl)
+static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
+ enum alloc_reserve reserve, struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
struct open_buckets buckets;
@@ -1281,34 +1445,30 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
bool have_cache = true;
int ret = 0;
- for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
- if (test_bit(i, h->s->blocks_gotten)) {
- __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
- if (i < h->s->nr_data)
- nr_have_data++;
- else
- nr_have_parity++;
- }
+ BUG_ON(h->s->new_stripe.key.v.nr_blocks != h->s->nr_data + h->s->nr_parity);
+ BUG_ON(h->s->new_stripe.key.v.nr_redundant != h->s->nr_parity);
+
+ for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) {
+ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
+ if (i < h->s->nr_data)
+ nr_have_data++;
+ else
+ nr_have_parity++;
}
BUG_ON(nr_have_data > h->s->nr_data);
BUG_ON(nr_have_parity > h->s->nr_parity);
- percpu_down_read(&c->mark_lock);
- rcu_read_lock();
-
buckets.nr = 0;
if (nr_have_parity < h->s->nr_parity) {
- ret = bch2_bucket_alloc_set(c, &buckets,
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->parity_stripe,
&devs,
h->s->nr_parity,
&nr_have_parity,
- &have_cache,
- h->copygc
- ? RESERVE_MOVINGGC
- : RESERVE_NONE,
- 0,
+ &have_cache, 0,
+ BCH_DATA_parity,
+ reserve,
cl);
open_bucket_for_each(c, &buckets, ob, i) {
@@ -1323,21 +1483,19 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
}
if (ret)
- goto err;
+ return ret;
}
buckets.nr = 0;
if (nr_have_data < h->s->nr_data) {
- ret = bch2_bucket_alloc_set(c, &buckets,
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
&h->block_stripe,
&devs,
h->s->nr_data,
&nr_have_data,
- &have_cache,
- h->copygc
- ? RESERVE_MOVINGGC
- : RESERVE_NONE,
- 0,
+ &have_cache, 0,
+ BCH_DATA_user,
+ reserve,
cl);
open_bucket_for_each(c, &buckets, ob, i) {
@@ -1351,12 +1509,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
}
if (ret)
- goto err;
+ return ret;
}
-err:
- rcu_read_unlock();
- percpu_up_read(&c->mark_lock);
- return ret;
+
+ return 0;
}
/* XXX: doesn't obey target: */
@@ -1372,59 +1528,76 @@ static s64 get_existing_stripe(struct bch_fs *c,
if (may_create_new_stripe(c))
return -1;
- spin_lock(&c->ec_stripes_heap_lock);
+ mutex_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
/* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
stripe_idx = h->data[heap_idx].idx;
+
m = genradix_ptr(&c->stripes, stripe_idx);
if (m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
m->sectors == head->blocksize &&
- m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
- bch2_stripes_heap_del(c, m, stripe_idx);
+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
+ bch2_try_open_stripe(c, head->s, stripe_idx)) {
ret = stripe_idx;
break;
}
}
- spin_unlock(&c->ec_stripes_heap_lock);
+ mutex_unlock(&c->ec_stripes_heap_lock);
return ret;
}
-static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
- struct ec_stripe_head *h)
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
{
+ struct bch_fs *c = trans->c;
unsigned i;
s64 idx;
int ret;
+ /*
+ * If we can't allocate a new stripe, and there's no stripes with empty
+ * blocks for us to reuse, that means we have to wait on copygc:
+ */
idx = get_existing_stripe(c, h);
- if (idx < 0) {
- bch_err(c, "failed to find an existing stripe");
- return -ENOSPC;
- }
+ if (idx < 0)
+ return -BCH_ERR_stripe_alloc_blocked;
- h->s->have_existing_stripe = true;
- ret = get_stripe_key(c, idx, &h->s->existing_stripe);
+ ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
if (ret) {
- bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
+ bch2_stripe_close(c, h->s);
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
return ret;
}
- if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) {
- /*
- * this is a problem: we have deleted from the
- * stripes heap already
- */
- BUG();
+ BUG_ON(h->s->existing_stripe.key.v.nr_redundant != h->s->nr_parity);
+ h->s->nr_data = h->s->existing_stripe.key.v.nr_blocks -
+ h->s->existing_stripe.key.v.nr_redundant;
+
+ ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
+ if (ret) {
+ bch2_stripe_close(c, h->s);
+ return ret;
}
BUG_ON(h->s->existing_stripe.size != h->blocksize);
BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors);
+ /*
+ * Free buckets we initially allocated - they might conflict with
+ * blocks from the stripe we're reusing:
+ */
+ for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) {
+ bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
+ h->s->blocks[i] = 0;
+ }
+ memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
+ memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
+
for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) {
__set_bit(i, h->s->blocks_gotten);
@@ -1434,90 +1607,161 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
}
- bkey_copy(&h->s->new_stripe.key.k_i,
- &h->s->existing_stripe.key.k_i);
+ bkey_copy(&h->s->new_stripe.key.k_i, &h->s->existing_stripe.key.k_i);
+ h->s->have_existing_stripe = true;
return 0;
}
-static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
- struct ec_stripe_head *h)
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos min_pos = POS(0, 1);
+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
- ret = bch2_disk_reservation_get(c, &h->s->res,
- h->blocksize,
- h->s->nr_parity, 0);
+ if (!h->s->res.sectors) {
+ ret = bch2_disk_reservation_get(c, &h->s->res,
+ h->blocksize,
+ h->s->nr_parity,
+ BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ return ret;
+ }
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
+ if (start_pos.offset) {
+ start_pos = min_pos;
+ bch2_btree_iter_set_pos(&iter, start_pos);
+ continue;
+ }
+
+ ret = -BCH_ERR_ENOSPC_stripe_create;
+ break;
+ }
+ if (bkey_deleted(k.k) &&
+ bch2_try_open_stripe(c, h->s, k.k->p.offset))
+ break;
+ }
+
+ c->ec_stripe_hint = iter.pos.offset;
+
+ if (ret)
+ goto err;
+
+ ret = ec_stripe_mem_alloc(trans, &iter);
if (ret) {
- /*
- * This means we need to wait for copygc to
- * empty out buckets from existing stripes:
- */
- bch_err(c, "failed to reserve stripe");
+ bch2_stripe_close(c, h->s);
+ goto err;
}
+ h->s->new_stripe.key.k.p = iter.pos;
+out:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
+err:
+ bch2_disk_reservation_put(c, &h->s->res);
+ goto out;
}
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
unsigned target,
unsigned algo,
unsigned redundancy,
- bool copygc,
+ enum alloc_reserve reserve,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct ec_stripe_head *h;
+ bool waiting = false;
int ret;
- bool needs_stripe_new;
- h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc);
- if (!h) {
+ h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, reserve);
+ if (!h)
bch_err(c, "no stripe head");
- return NULL;
- }
+ if (IS_ERR_OR_NULL(h))
+ return h;
- needs_stripe_new = !h->s;
- if (needs_stripe_new) {
- if (ec_new_stripe_alloc(c, h)) {
- ret = -ENOMEM;
+ if (!h->s) {
+ ret = ec_new_stripe_alloc(c, h);
+ if (ret) {
bch_err(c, "failed to allocate new stripe");
goto err;
}
+ }
+
+ if (h->s->allocated)
+ goto allocated;
- if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize))
- BUG();
+ if (h->s->have_existing_stripe)
+ goto alloc_existing;
+
+ /* First, try to allocate a full stripe: */
+ ret = new_stripe_alloc_buckets(trans, h, RESERVE_stripe, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h);
+ if (!ret)
+ goto allocate_buf;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto err;
+
+ /*
+ * Not enough buckets available for a full stripe: we must reuse an
+ * existing stripe:
+ */
+ while (1) {
+ ret = __bch2_ec_stripe_head_reuse(trans, h);
+ if (!ret)
+ break;
+ if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+ goto err;
+
+ if (reserve == RESERVE_movinggc) {
+ ret = new_stripe_alloc_buckets(trans, h, reserve, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h);
+ if (ret)
+ goto err;
+ goto allocate_buf;
+ }
+
+ /* XXX freelist_wait? */
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
}
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc_existing:
/*
- * Try reserve a new stripe before reusing an
- * existing stripe. This will prevent unnecessary
- * read amplification during write oriented workloads.
+ * Retry allocating buckets, with the reserve watermark for this
+ * particular write:
*/
- ret = 0;
- if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe)
- ret = __bch2_ec_stripe_head_reserve(c, h);
- if (ret && needs_stripe_new)
- ret = __bch2_ec_stripe_head_reuse(c, h);
+ ret = new_stripe_alloc_buckets(trans, h, reserve, cl);
if (ret)
goto err;
- if (!h->s->allocated) {
- ret = new_stripe_alloc_buckets(c, h, cl);
- if (ret)
- goto err;
-
- h->s->allocated = true;
- }
+allocate_buf:
+ ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
+ if (ret)
+ goto err;
+ h->s->allocated = true;
+allocated:
+ BUG_ON(!h->s->idx);
+ BUG_ON(!h->s->new_stripe.data[0]);
+ BUG_ON(trans->restarted);
return h;
-
err:
bch2_ec_stripe_head_put(c, h);
return ERR_PTR(ret);
}
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
struct open_bucket *ob;
@@ -1525,11 +1769,13 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-
mutex_lock(&h->lock);
if (!h->s)
goto unlock;
+ if (!ca)
+ goto found;
+
for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
if (!h->s->blocks[i])
continue;
@@ -1548,14 +1794,30 @@ unlock:
mutex_unlock(&c->ec_stripe_head_lock);
}
-void bch2_stripes_heap_start(struct bch_fs *c)
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
- struct genradix_iter iter;
- struct stripe *m;
+ __bch2_ec_stop(c, ca);
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+ __bch2_ec_stop(c, NULL);
+}
+
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ ret = list_empty(&c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return ret;
+}
- genradix_for_each(&c->stripes, iter, m)
- if (m->alive)
- bch2_stripes_heap_insert(c, m, iter.pos);
+void bch2_fs_ec_flush(struct bch_fs *c)
+{
+ wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}
int bch2_stripes_read(struct bch_fs *c)
@@ -1582,7 +1844,6 @@ int bch2_stripes_read(struct bch_fs *c)
s = bkey_s_c_to_stripe(k).v;
m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->alive = true;
m->sectors = le16_to_cpu(s->sectors);
m->algorithm = s->algorithm;
m->nr_blocks = s->nr_blocks;
@@ -1592,9 +1853,7 @@ int bch2_stripes_read(struct bch_fs *c)
for (i = 0; i < s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_update(c, m, k.k->p.offset);
- spin_unlock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
}
bch2_trans_iter_exit(&trans, &iter);
@@ -1612,16 +1871,19 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
struct stripe *m;
size_t i;
- spin_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->used, 20); i++) {
+ mutex_lock(&c->ec_stripes_heap_lock);
+ for (i = 0; i < min_t(size_t, h->used, 50); i++) {
m = genradix_ptr(&c->stripes, h->data[i].idx);
- pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
+ prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
h->data[i].blocks_nonempty,
m->nr_blocks - m->nr_redundant,
m->nr_redundant);
+ if (bch2_stripe_is_open(c, h->data[i].idx))
+ prt_str(out, " open");
+ prt_newline(out);
}
- spin_unlock(&c->ec_stripes_heap_lock);
+ mutex_unlock(&c->ec_stripes_heap_lock);
}
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
@@ -1631,22 +1893,27 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- pr_buf(out, "target %u algo %u redundancy %u:\n",
- h->target, h->algo, h->redundancy);
+ prt_printf(out, "target %u algo %u redundancy %u %s:\n",
+ h->target, h->algo, h->redundancy,
+ bch2_alloc_reserves[h->reserve]);
if (h->s)
- pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
- h->s->nr_data, h->s->nr_parity,
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
+ h->s->idx, h->s->nr_data, h->s->nr_parity,
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data));
}
mutex_unlock(&c->ec_stripe_head_lock);
+ prt_printf(out, "in flight:\n");
+
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
- s->nr_data, s->nr_parity,
- atomic_read(&s->pin));
+ prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
+ s->idx, s->nr_data, s->nr_parity,
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_alloc_reserves[s->h->reserve]);
}
mutex_unlock(&c->ec_stripe_new_lock);
}
@@ -1654,6 +1921,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_fs_ec_exit(struct bch_fs *c)
{
struct ec_stripe_head *h;
+ unsigned i;
while (1) {
mutex_lock(&c->ec_stripe_head_lock);
@@ -1665,7 +1933,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
if (!h)
break;
- BUG_ON(h->s);
+ if (h->s) {
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++)
+ BUG_ON(h->s->blocks[i]);
+
+ kfree(h->s);
+ }
kfree(h);
}
@@ -1676,11 +1949,24 @@ void bch2_fs_ec_exit(struct bch_fs *c)
bioset_exit(&c->ec_bioset);
}
-int bch2_fs_ec_init(struct bch_fs *c)
+void bch2_fs_ec_init_early(struct bch_fs *c)
{
+ spin_lock_init(&c->ec_stripes_new_lock);
+ mutex_init(&c->ec_stripes_heap_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_head_list);
+ mutex_init(&c->ec_stripe_head_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_new_list);
+ mutex_init(&c->ec_stripe_new_lock);
+ init_waitqueue_head(&c->ec_stripe_new_wait);
+
INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+}
+int bch2_fs_ec_init(struct bch_fs *c)
+{
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}
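
Since bch2_ec_stripe_head_get() now takes a btree_trans and an alloc_reserve and reports failure through the private errcodes, a caller-side sketch may help. example_reserve_ec_stripe() below is hypothetical (the real call sites live in the allocator paths) and only exercises the signatures introduced by this patch:

	static int example_reserve_ec_stripe(struct btree_trans *trans,
					     unsigned target, unsigned redundancy,
					     enum alloc_reserve reserve,
					     struct closure *cl)
	{
		struct ec_stripe_head *h =
			bch2_ec_stripe_head_get(trans, target, 0, redundancy,
						reserve, cl);

		if (!h)
			return 0;	/* no stripe head: caller falls back to non-EC */
		if (IS_ERR(h))
			/*
			 * May be a -BCH_ERR_transaction_restart_* code: check with
			 * bch2_err_matches(PTR_ERR(h), BCH_ERR_transaction_restart)
			 * and restart the transaction in that case.
			 */
			return PTR_ERR(h);

		/* ... hand open buckets from h->s->blocks[] to the write point ... */

		bch2_ec_stripe_head_put(trans->c, h);
		return 0;
	}
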
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 78d468c7680a..7c08a49d7419 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -4,17 +4,20 @@
#include "ec_types.h"
#include "buckets_types.h"
-#include "keylist_types.h"
+#include "extents_types.h"
-const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-#define bch2_bkey_ops_stripe (struct bkey_ops) { \
+#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
-}
+ .trans_trigger = bch2_trans_mark_stripe, \
+ .atomic_trigger = bch2_mark_stripe, \
+})
static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
{
@@ -140,15 +143,24 @@ struct ec_stripe_buf {
struct ec_stripe_head;
+enum ec_stripe_ref {
+ STRIPE_REF_io,
+ STRIPE_REF_stripe,
+ STRIPE_REF_NR
+};
+
struct ec_stripe_new {
struct bch_fs *c;
struct ec_stripe_head *h;
struct mutex lock;
struct list_head list;
+
+ struct hlist_node hash;
+ u64 idx;
+
struct closure iodone;
- /* counts in flight writes, stripe is created when pin == 0 */
- atomic_t pin;
+ atomic_t ref[STRIPE_REF_NR];
int err;
@@ -163,9 +175,6 @@ struct ec_stripe_new {
open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
- struct keylist keys;
- u64 inline_keys[BKEY_U64s * 8];
-
struct ec_stripe_buf new_stripe;
struct ec_stripe_buf existing_stripe;
};
@@ -177,7 +186,7 @@ struct ec_stripe_head {
unsigned target;
unsigned algo;
unsigned redundancy;
- bool copygc;
+ enum alloc_reserve reserve;
struct bch_devs_mask devs;
unsigned nr_active_devs;
@@ -193,27 +202,51 @@ struct ec_stripe_head {
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *,
- struct bkey *);
-void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *,
- unsigned, unsigned, unsigned, bool, struct closure *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
+ unsigned, unsigned, unsigned,
+ enum alloc_reserve, struct closure *);
void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
-void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+void bch2_do_stripe_deletes(struct bch_fs *);
+void bch2_ec_do_stripe_creates(struct bch_fs *);
+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-void bch2_ec_flush_new_stripes(struct bch_fs *);
+static inline void ec_stripe_new_get(struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ atomic_inc(&s->ref[ref]);
+}
+
+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ BUG_ON(atomic_read(&s->ref[ref]) <= 0);
+
+ if (atomic_dec_and_test(&s->ref[ref]))
+ switch (ref) {
+ case STRIPE_REF_stripe:
+ bch2_ec_stripe_new_free(c, s);
+ break;
+ case STRIPE_REF_io:
+ bch2_ec_do_stripe_creates(c);
+ break;
+ default:
+ unreachable();
+ }
+}
-void bch2_stripes_heap_start(struct bch_fs *);
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
@@ -221,6 +254,7 @@ void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);
+void bch2_fs_ec_init_early(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
#endif /* _BCACHEFS_EC_H */
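
The single in-flight-writes pin is gone; ec_stripe_new now carries two counters whose zero-crossings trigger different work, as the switch in ec_stripe_new_put() above shows. A minimal sketch of the intended discipline (illustrative only, not a call site from this patch):

	static void example_stripe_io_ref(struct bch_fs *c, struct ec_stripe_new *s)
	{
		/* a data write against one of the stripe's blocks is in flight: */
		ec_stripe_new_get(s, STRIPE_REF_io);

		/* ... submit and complete the write ... */

		/*
		 * Dropping the last STRIPE_REF_io ref kicks
		 * bch2_ec_do_stripe_creates(), which writes out the stripe key;
		 * dropping the last STRIPE_REF_stripe ref later frees the
		 * ec_stripe_new via bch2_ec_stripe_new_free():
		 */
		ec_stripe_new_put(c, s, STRIPE_REF_io);
	}
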
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index edd93da663c1..e2b02a82de32 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -2,7 +2,7 @@
#ifndef _BCACHEFS_EC_TYPES_H
#define _BCACHEFS_EC_TYPES_H
-#include <linux/llist.h>
+#include "bcachefs_format.h"
struct bch_replicas_padded {
struct bch_replicas_entry e;
@@ -11,15 +11,10 @@ struct bch_replicas_padded {
struct stripe {
size_t heap_idx;
-
u16 sectors;
u8 algorithm;
-
u8 nr_blocks;
u8 nr_redundant;
-
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
- unsigned on_heap:1;
u8 blocks_nonempty;
};
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
new file mode 100644
index 000000000000..dc906fc9176f
--- /dev/null
+++ b/fs/bcachefs/errcode.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "errcode.h"
+
+#include <linux/errname.h>
+
+static const char * const bch2_errcode_strs[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
+ BCH_ERRCODES()
+#undef x
+ NULL
+};
+
+#define BCH_ERR_0 0
+
+static unsigned bch2_errcode_parents[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+ BCH_ERRCODES()
+#undef x
+};
+
+const char *bch2_err_str(int err)
+{
+ const char *errstr;
+
+ err = abs(err);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+
+ if (err >= BCH_ERR_START)
+ errstr = bch2_errcode_strs[err - BCH_ERR_START];
+ else if (err)
+ errstr = errname(err);
+ else
+ errstr = "(No error)";
+ return errstr ?: "(Invalid error)";
+}
+
+bool __bch2_err_matches(int err, int class)
+{
+ err = abs(err);
+ class = abs(class);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+ BUG_ON(class >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && err != class)
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return err == class;
+}
+
+int __bch2_err_class(int err)
+{
+ err = -err;
+ BUG_ON((unsigned) err >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return -err;
+}
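
A concrete trace of the parent walk: __bch2_err_matches(-BCH_ERR_ENOSPC_stripe_create, ENOSPC) takes abs() of both values, sees the error is >= BCH_ERR_START, and replaces it with its parent from bch2_errcode_parents[] - which the BCH_ERRCODES() table declares as ENOSPC. ENOSPC is below BCH_ERR_START, so the loop stops and the comparison succeeds. The same walk is what lets any transaction_restart_* subcode match the BCH_ERR_transaction_restart class, as in the ec.c hunks above.
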
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index f7d12915c1cc..4304e25a6b24 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -2,11 +2,229 @@
#ifndef _BCACHEFS_ERRCODE_H
#define _BCACHEFS_ERRCODE_H
-enum {
- /* Bucket allocator: */
- OPEN_BUCKETS_EMPTY = 2048,
- FREELIST_EMPTY, /* Allocator thread not keeping up */
- INSUFFICIENT_DEVICES,
+#define BCH_ERRCODES() \
+ x(ENOMEM, ENOMEM_stripe_buf) \
+ x(ENOMEM, ENOMEM_replicas_table) \
+ x(ENOMEM, ENOMEM_cpu_replicas) \
+ x(ENOMEM, ENOMEM_replicas_gc) \
+ x(ENOMEM, ENOMEM_disk_groups_validate) \
+ x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
+ x(ENOMEM, ENOMEM_mark_snapshot) \
+ x(ENOMEM, ENOMEM_mark_stripe) \
+ x(ENOMEM, ENOMEM_mark_stripe_ptr) \
+ x(ENOMEM, ENOMEM_btree_key_cache_create) \
+ x(ENOMEM, ENOMEM_btree_key_cache_fill) \
+ x(ENOMEM, ENOMEM_btree_key_cache_insert) \
+ x(ENOMEM, ENOMEM_trans_kmalloc) \
+ x(ENOMEM, ENOMEM_trans_log_msg) \
+ x(ENOMEM, ENOMEM_do_encrypt) \
+ x(ENOMEM, ENOMEM_ec_read_extent) \
+ x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
+ x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
+ x(ENOMEM, ENOMEM_fs_btree_cache_init) \
+ x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
+ x(ENOMEM, ENOMEM_fs_counters_init) \
+ x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
+ x(ENOMEM, ENOMEM_io_clock_init) \
+ x(ENOMEM, ENOMEM_blacklist_table_init) \
+ x(ENOMEM, ENOMEM_sb_realloc_injected) \
+ x(ENOMEM, ENOMEM_sb_bio_realloc) \
+ x(ENOMEM, ENOMEM_sb_buf_realloc) \
+ x(ENOMEM, ENOMEM_sb_journal_validate) \
+ x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
+ x(ENOMEM, ENOMEM_journal_entry_add) \
+ x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
+ x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
+ x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
+ x(ENOMEM, ENOMEM_bio_read_init) \
+ x(ENOMEM, ENOMEM_bio_read_split_init) \
+ x(ENOMEM, ENOMEM_bio_write_init) \
+ x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
+ x(ENOMEM, ENOMEM_writepage_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_read_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_write_bioset_init) \
+ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
+ x(ENOMEM, ENOMEM_promote_table_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_read_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_write_init) \
+ x(ENOMEM, ENOMEM_compression_workspace_init) \
+ x(ENOMEM, ENOMEM_decompression_workspace_init) \
+ x(ENOMEM, ENOMEM_bucket_gens) \
+ x(ENOMEM, ENOMEM_buckets_nouse) \
+ x(ENOMEM, ENOMEM_usage_init) \
+ x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
+ x(ENOMEM, ENOMEM_btree_node_reclaim) \
+ x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
+ x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
+ x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
+ x(ENOMEM, ENOMEM_dev_journal_init) \
+ x(ENOMEM, ENOMEM_journal_pin_fifo) \
+ x(ENOMEM, ENOMEM_journal_buf) \
+ x(ENOMEM, ENOMEM_gc_start) \
+ x(ENOMEM, ENOMEM_gc_alloc_start) \
+ x(ENOMEM, ENOMEM_gc_reflink_start) \
+ x(ENOMEM, ENOMEM_gc_gens) \
+ x(ENOMEM, ENOMEM_gc_repair_key) \
+ x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
+ x(ENOMEM, ENOMEM_fsck_add_nlink) \
+ x(ENOMEM, ENOMEM_journal_key_insert) \
+ x(ENOMEM, ENOMEM_journal_keys_sort) \
+ x(ENOMEM, ENOMEM_journal_replay) \
+ x(ENOMEM, ENOMEM_read_superblock_clean) \
+ x(ENOMEM, ENOMEM_fs_alloc) \
+ x(ENOMEM, ENOMEM_fs_name_alloc) \
+ x(ENOMEM, ENOMEM_fs_other_alloc) \
+ x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOSPC, ENOSPC_disk_reservation) \
+ x(ENOSPC, ENOSPC_bucket_alloc) \
+ x(ENOSPC, ENOSPC_disk_label_add) \
+ x(ENOSPC, ENOSPC_stripe_create) \
+ x(ENOSPC, ENOSPC_inode_create) \
+ x(ENOSPC, ENOSPC_str_hash_create) \
+ x(ENOSPC, ENOSPC_snapshot_create) \
+ x(ENOSPC, ENOSPC_subvolume_create) \
+ x(ENOSPC, ENOSPC_sb) \
+ x(ENOSPC, ENOSPC_sb_journal) \
+ x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
+ x(ENOSPC, ENOSPC_sb_quota) \
+ x(ENOSPC, ENOSPC_sb_replicas) \
+ x(ENOSPC, ENOSPC_sb_members) \
+ x(ENOSPC, ENOSPC_sb_crypt) \
+ x(0, open_buckets_empty) \
+ x(0, freelist_empty) \
+ x(BCH_ERR_freelist_empty, no_buckets_found) \
+ x(0, transaction_restart) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
+ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
+ x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
+ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
+ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
+ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
+ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
+ x(BCH_ERR_transaction_restart, transaction_restart_nested) \
+ x(0, no_btree_node) \
+ x(BCH_ERR_no_btree_node, no_btree_node_relock) \
+ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
+ x(BCH_ERR_no_btree_node, no_btree_node_drop) \
+ x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
+ x(BCH_ERR_no_btree_node, no_btree_node_up) \
+ x(BCH_ERR_no_btree_node, no_btree_node_down) \
+ x(BCH_ERR_no_btree_node, no_btree_node_init) \
+ x(BCH_ERR_no_btree_node, no_btree_node_cached) \
+ x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
+ x(0, btree_insert_fail) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
+ x(0, backpointer_to_overwritten_btree_node) \
+ x(0, lock_fail_root_changed) \
+ x(0, journal_reclaim_would_deadlock) \
+ x(0, fsck) \
+ x(BCH_ERR_fsck, fsck_fix) \
+ x(BCH_ERR_fsck, fsck_ignore) \
+ x(BCH_ERR_fsck, fsck_errors_not_fixed) \
+ x(BCH_ERR_fsck, fsck_repair_unimplemented) \
+ x(BCH_ERR_fsck, fsck_repair_impossible) \
+ x(0, need_snapshot_cleanup) \
+ x(0, need_topology_repair) \
+ x(0, unwritten_extent_update) \
+ x(EINVAL, device_state_not_allowed) \
+ x(EINVAL, member_info_missing) \
+ x(EINVAL, mismatched_block_size) \
+ x(EINVAL, block_size_too_small) \
+ x(EINVAL, bucket_size_too_small) \
+ x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_not_a_member_of_filesystem) \
+ x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_already_online) \
+ x(EINVAL, insufficient_devices_to_start) \
+ x(EINVAL, invalid) \
+ x(EROFS, erofs_trans_commit) \
+ x(EROFS, erofs_no_writes) \
+ x(EROFS, erofs_journal_err) \
+ x(EROFS, erofs_sb_err) \
+ x(EROFS, insufficient_devices) \
+ x(0, operation_blocked) \
+ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
+ x(BCH_ERR_operation_blocked, journal_res_get_blocked) \
+ x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \
+ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
+ x(BCH_ERR_invalid, invalid_sb) \
+ x(BCH_ERR_invalid_sb, invalid_sb_magic) \
+ x(BCH_ERR_invalid_sb, invalid_sb_version) \
+ x(BCH_ERR_invalid_sb, invalid_sb_features) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_big) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum) \
+ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
+ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
+ x(BCH_ERR_invalid_sb, invalid_sb_field_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_layout) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
+ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
+ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
+ x(BCH_ERR_invalid_sb, invalid_sb_clean) \
+ x(BCH_ERR_invalid_sb, invalid_sb_quota) \
+ x(BCH_ERR_invalid, invalid_bkey) \
+ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+
+enum bch_errcode {
+ BCH_ERR_START = 2048,
+#define x(class, err) BCH_ERR_##err,
+ BCH_ERRCODES()
+#undef x
+ BCH_ERR_MAX
};
+const char *bch2_err_str(int);
+bool __bch2_err_matches(int, int);
+
+static inline bool _bch2_err_matches(int err, int class)
+{
+ return err && __bch2_err_matches(err, class);
+}
+
+#define bch2_err_matches(_err, _class) \
+({ \
+ BUILD_BUG_ON(!__builtin_constant_p(_class)); \
+ _bch2_err_matches(_err, _class); \
+})
+
+int __bch2_err_class(int);
+
+static inline long bch2_err_class(long err)
+{
+ return err < 0 ? __bch2_err_class(err) : err;
+}
+
 #endif /* _BCACHEFS_ERRCODE_H */
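
Going the other way, private codes are not supposed to escape the filesystem; bch2_err_class() walks the same parent table back down to a plain errno. A hedged sketch (the wrapper below is made up for illustration, not a call site from this patch):

	static long example_return_to_vfs(long ret)
	{
		/*
		 * e.g. ret == -BCH_ERR_ENOSPC_stripe_create: bch2_err_class()
		 * follows the parent chain and hands back plain -ENOSPC, while
		 * bch2_err_str() can still log the precise internal code first.
		 */
		return bch2_err_class(ret);
	}
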
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 2cea694575e9..1dae649ff0e2 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only");
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
@@ -27,15 +27,18 @@ bool bch2_inconsistent_error(struct bch_fs *c)
void bch2_topology_error(struct bch_fs *c)
{
+ if (!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags))
+ return;
+
set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
- if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
bch2_inconsistent_error(c);
}
void bch2_fatal_error(struct bch_fs *c)
{
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "fatal error - emergency read only");
}
void bch2_io_error_work(struct work_struct *work)
@@ -68,103 +71,150 @@ void bch2_io_error(struct bch_dev *ca)
#include "tools-util.h"
#endif
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
- const char *fmt, ...)
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
{
- struct fsck_err_state *s = NULL;
- va_list args;
- bool fix = false, print = true, suppressing = false;
- char _buf[sizeof(s->buf)], *buf = _buf;
-
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
- va_start(args, fmt);
- vprintk(fmt, args);
- va_end(args);
+ struct fsck_err_state *s;
- if (c->opts.errors == BCH_ON_ERROR_continue) {
- bch_err(c, "fixing");
- return FSCK_ERR_FIX;
- } else {
- bch2_inconsistent_error(c);
- return FSCK_ERR_EXIT;
- }
- }
-
- mutex_lock(&c->fsck_error_lock);
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ return NULL;
list_for_each_entry(s, &c->fsck_errors, list)
- if (s->fmt == fmt)
- goto found;
+ if (s->fmt == fmt) {
+ /*
+ * move it to the head of the list: repeated fsck errors
+ * are common
+ */
+ list_move(&s->list, &c->fsck_errors);
+ return s;
+ }
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s) {
if (!c->fsck_alloc_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
c->fsck_alloc_err = true;
- buf = _buf;
- goto print;
+ return NULL;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
-found:
- list_move(&s->list, &c->fsck_errors);
- s->nr++;
- if (c->opts.ratelimit_errors &&
- !(flags & FSCK_NO_RATELIMIT) &&
- s->nr >= FSCK_ERR_RATELIMIT_NR) {
- if (s->nr == FSCK_ERR_RATELIMIT_NR)
- suppressing = true;
- else
- print = false;
- }
- buf = s->buf;
-print:
+ list_add(&s->list, &c->fsck_errors);
+ return s;
+}
+
+int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
+{
+ struct fsck_err_state *s = NULL;
+ va_list args;
+ bool print = true, suppressing = false, inconsistent = false;
+ struct printbuf buf = PRINTBUF, *out = &buf;
+ int ret = -BCH_ERR_fsck_ignore;
+
va_start(args, fmt);
- vscnprintf(buf, sizeof(_buf), fmt, args);
+ prt_vprintf(out, fmt, args);
va_end(args);
- if (c->opts.fix_errors == FSCK_OPT_EXIT) {
- bch_err(c, "%s, exiting", buf);
+ mutex_lock(&c->fsck_error_lock);
+ s = fsck_err_get(c, fmt);
+ if (s) {
+ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
+ ret = s->ret;
+ mutex_unlock(&c->fsck_error_lock);
+ printbuf_exit(&buf);
+ return ret;
+ }
+
+ kfree(s->last_msg);
+ s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+
+ if (c->opts.ratelimit_errors &&
+ !(flags & FSCK_NO_RATELIMIT) &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ suppressing = true;
+ else
+ print = false;
+ }
+
+ s->nr++;
+ }
+
+#ifdef BCACHEFS_LOG_PREFIX
+ if (!strncmp(fmt, "bcachefs:", 9))
+ prt_printf(out, bch2_log_msg(c, ""));
+#endif
+
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+ if (c->opts.errors != BCH_ON_ERROR_continue ||
+ !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
+ prt_str(out, ", shutting down");
+ inconsistent = true;
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+ } else if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+ prt_str(out, ", exiting");
+ ret = -BCH_ERR_fsck_errors_not_fixed;
} else if (flags & FSCK_CAN_FIX) {
if (c->opts.fix_errors == FSCK_OPT_ASK) {
- printk(KERN_ERR "%s: fix?", buf);
- fix = ask_yn();
+ prt_str(out, ": fix?");
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ print = false;
+ ret = ask_yn()
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
} else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
- if (print)
- bch_err(c, "%s, fixing", buf);
- fix = true;
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
} else {
- if (print)
- bch_err(c, "%s, not fixing", buf);
- fix = false;
+ prt_str(out, ", not fixing");
}
} else if (flags & FSCK_NEED_FSCK) {
- if (print)
- bch_err(c, "%s (run fsck to correct)", buf);
+ prt_str(out, " (run fsck to correct)");
} else {
- if (print)
- bch_err(c, "%s (repair unimplemented)", buf);
+ prt_str(out, " (repair unimplemented)");
}
- if (suppressing)
+ if (ret == -BCH_ERR_fsck_ignore &&
+ (c->opts.fix_errors == FSCK_OPT_EXIT ||
+ !(flags & FSCK_CAN_IGNORE)))
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+
+ if (print)
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+
+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore))
+ bch_err(c, "Unable to continue, halting");
+ else if (suppressing)
bch_err(c, "Ratelimiting new instances of previous error");
+ if (s)
+ s->ret = ret;
+
mutex_unlock(&c->fsck_error_lock);
- if (fix) {
+ printbuf_exit(&buf);
+
+ if (inconsistent)
+ bch2_inconsistent_error(c);
+
+ if (ret == -BCH_ERR_fsck_fix) {
set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
- return FSCK_ERR_FIX;
} else {
set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
set_bit(BCH_FS_ERROR, &c->flags);
- return c->opts.fix_errors == FSCK_OPT_EXIT ||
- !(flags & FSCK_CAN_IGNORE)
- ? FSCK_ERR_EXIT
- : FSCK_ERR_IGNORE;
}
+
+ return ret;
}
void bch2_flush_fsck_errs(struct bch_fs *c)
@@ -174,10 +224,11 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_lock(&c->fsck_error_lock);
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
- if (s->ratelimited)
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
+ if (s->ratelimited && s->last_msg)
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
list_del(&s->list);
+ kfree(s->last_msg);
kfree(s);
}
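
One behavioral detail worth spelling out: fsck_err_get() still keys the ratelimit state on the format string, but bch2_fsck_err() now also remembers the last rendered message per state. If the same message for that format repeats, the call short-circuits and returns the cached s->ret without printing anything, which is why s->last_msg has to be freed alongside the state in bch2_flush_fsck_errs() above.
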
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 986938298adc..91c7e4ee8f72 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -39,7 +39,7 @@ void bch2_topology_error(struct bch_fs *);
#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_fs_inconsistent(c, __VA_ARGS__); \
@@ -59,7 +59,7 @@ do { \
#define bch2_dev_inconsistent_on(cond, ca, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_dev_inconsistent(ca, __VA_ARGS__); \
@@ -67,18 +67,30 @@ do { \
})
/*
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
+ */
+#define bch2_trans_inconsistent(trans, ...) \
+({ \
+ bch_err(trans->c, __VA_ARGS__); \
+ bch2_dump_trans_updates(trans); \
+ bch2_inconsistent_error(trans->c); \
+})
+
+#define bch2_trans_inconsistent_on(cond, trans, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_trans_inconsistent(trans, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
* Fsck errors: inconsistency errors we detect at mount time, and should ideally
* be able to repair:
*/
-enum {
- BCH_FSCK_OK = 0,
- BCH_FSCK_ERRORS_NOT_FIXED = 1,
- BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
- BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
- BCH_FSCK_UNKNOWN_VERSION = 4,
-};
-
enum fsck_err_opts {
FSCK_OPT_EXIT,
FSCK_OPT_YES,
@@ -86,19 +98,13 @@ enum fsck_err_opts {
FSCK_OPT_ASK,
};
-enum fsck_err_ret {
- FSCK_ERR_IGNORE = 0,
- FSCK_ERR_FIX = 1,
- FSCK_ERR_EXIT = 2,
- FSCK_ERR_START_TOPOLOGY_REPAIR = 3,
-};
-
struct fsck_err_state {
struct list_head list;
const char *fmt;
u64 nr;
bool ratelimited;
- char buf[512];
+ int ret;
+ char *last_msg;
};
#define FSCK_CAN_FIX (1 << 0)
@@ -107,21 +113,20 @@ struct fsck_err_state {
#define FSCK_NO_RATELIMIT (1 << 3)
__printf(3, 4) __cold
-enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
- unsigned, const char *, ...);
+int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);
#define __fsck_err(c, _flags, msg, ...) \
({ \
- int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
+ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \
\
- if (_fix == FSCK_ERR_EXIT) { \
- bch_err(c, "Unable to continue, halting"); \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) { \
+ ret = _ret; \
goto fsck_err; \
} \
\
- _fix; \
+ _ret == -BCH_ERR_fsck_fix; \
})
/* These macros return true if error should be fixed: */
@@ -129,7 +134,7 @@ void bch2_flush_fsck_errs(struct bch_fs *);
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
#define __fsck_err_on(cond, c, _flags, ...) \
- ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
#define need_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
@@ -164,7 +169,7 @@ do { \
#define bch2_fs_fatal_err_on(cond, c, ...) \
({ \
- int _ret = !!(cond); \
+ bool _ret = unlikely(!!(cond)); \
\
if (_ret) \
bch2_fs_fatal_error(c, __VA_ARGS__); \
@@ -182,36 +187,25 @@ void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *);
-/* Logs message and handles the error: */
-#define bch2_dev_io_error(ca, fmt, ...) \
-do { \
- printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \
- (ca)->name, ##__VA_ARGS__); \
- bch2_io_error(ca); \
-} while (0)
-
-#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \
-do { \
- printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
- (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \
- bch2_io_error(ca); \
-} while (0)
-
#define bch2_dev_io_err_on(cond, ca, ...) \
({ \
bool _ret = (cond); \
\
- if (_ret) \
- bch2_dev_io_error(ca, __VA_ARGS__); \
+ if (_ret) { \
+ bch_err_dev_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca); \
+ } \
_ret; \
})
-#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) \
+#define bch2_dev_inum_io_err_on(cond, ca, ...) \
({ \
bool _ret = (cond); \
\
- if (_ret) \
- bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
+ if (_ret) { \
+ bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca); \
+ } \
_ret; \
})
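
The reworked fsck macros keep the old calling convention: the enclosing function needs an int ret and a fsck_err: label, because __fsck_err() jumps there when the error can neither be fixed nor ignored, and the expression itself evaluates to true when the caller should repair. A hypothetical check, for illustration only:

	static int example_check_key(struct bch_fs *c, struct bkey_s_c k)
	{
		int ret = 0;

		if (__fsck_err_on(bkey_deleted(k.k), c,
				  FSCK_CAN_FIX|FSCK_CAN_IGNORE,
				  "example inconsistency at offset %llu",
				  k.k->p.offset)) {
			/* returned true: repair the key here */
		}
	fsck_err:
		return ret;
	}
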
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 58b2c96f450c..21af6fb8cecf 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- unsigned ret = 0;
+ unsigned ret = 0, lru = 0;
bkey_extent_entry_for_each(ptrs, entry) {
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
+ /* Might also be updating LRU btree */
+ if (entry->ptr.cached)
+ lru++;
+
+ fallthrough;
case BCH_EXTENT_ENTRY_stripe_ptr:
ret++;
}
}
- return ret;
+ /*
+ * Updating keys in the alloc btree may also update keys in the
+ * freespace or discard btrees:
+ */
+ return lru + ret * 2;
}
static int count_iters_for_insert(struct btree_trans *trans,
@@ -64,8 +73,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter,
BTREE_ID_reflink, POS(0, idx + offset),
BTREE_ITER_SLOTS, r_k, ret2) {
- if (bkey_cmp(bkey_start_pos(r_k.k),
- POS(0, idx + sectors)) >= 0)
+ if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
break;
/* extent_update_to_keys(), for the reflink_v update */
@@ -120,14 +128,10 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
bch2_trans_copy_iter(&copy, iter);
- for_each_btree_key_continue_norestart(copy, 0, k, ret) {
+ for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
unsigned offset = 0;
- if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
- break;
-
- if (bkey_cmp(bkey_start_pos(&insert->k),
- bkey_start_pos(k.k)) > 0)
+ if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
offset = bkey_start_offset(&insert->k) -
bkey_start_offset(k.k);
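
Worked example for the new accounting in bch2_bkey_nr_alloc_ptrs(): an extent with two dirty pointers, one cached pointer and one stripe pointer has four entries that bump ret (three BCH_EXTENT_ENTRY_ptr plus one stripe_ptr) and one cached pointer that bumps lru, so the function now reports 1 + 4 * 2 = 9 - room for the alloc-btree update per pointer, the possible freespace/discard updates, and an LRU-btree update for the cached copy - where the old version would have said 4.
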
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 44c584e9adaa..b35b584176ee 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -115,6 +115,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
return -EIO;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ /*
+ * Unwritten extent: no need to actually read, treat it as a
+ * hole and return 0s:
+ */
+ if (p.ptr.unwritten)
+ return 0;
+
ca = bch_dev_bkey_exists(c, p.ptr.dev);
/*
@@ -156,12 +163,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
- return "value too big";
+ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
+ prt_printf(err, "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
+ return -BCH_ERR_invalid_bkey;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, flags, err);
}
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -170,35 +181,45 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
- return "value too small";
+ if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) {
+ prt_printf(err, "value too small (%zu <= %zu)",
+ bkey_val_bytes(k.k), sizeof(*bp.v));
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
- return "value too big";
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
+ prt_printf(err, "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+ return -BCH_ERR_invalid_bkey;
+ }
if (c->sb.version < bcachefs_metadata_version_snapshot &&
- bp.v->min_key.snapshot)
- return "invalid min_key.snapshot";
+ bp.v->min_key.snapshot) {
+ prt_printf(err, "invalid min_key.snapshot (%u != 0)",
+ bp.v->min_key.snapshot);
+ return -BCH_ERR_invalid_bkey;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, flags, err);
}
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+ struct bkey_s_c k)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- pr_buf(out, "seq %llx written %u min_key %s",
+ prt_printf(out, "seq %llx written %u min_key %s",
le64_to_cpu(bp.v->seq),
le16_to_cpu(bp.v->sectors_written),
BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
bch2_bpos_to_text(out, bp.v->min_key);
- pr_buf(out, " ");
+ prt_printf(out, " ");
bch2_bkey_ptrs_to_text(out, c, k);
}
@@ -212,7 +233,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bp.v->min_key, POS_MIN))
+ !bkey_eq(bp.v->min_key, POS_MIN))
bp.v->min_key = write
? bpos_nosnap_predecessor(bp.v->min_key)
: bpos_nosnap_successor(bp.v->min_key);
@@ -220,17 +241,6 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
/* KEY_TYPE_extent: */
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
- return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
-}
-
bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
@@ -265,6 +275,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
rp.ptr.offset + rp.crc.offset ||
lp.ptr.dev != rp.ptr.dev ||
lp.ptr.gen != rp.ptr.gen ||
+ lp.ptr.unwritten != rp.ptr.unwritten ||
lp.has_ec != rp.has_ec)
return false;
@@ -287,7 +298,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
lp.crc.uncompressed_size) {
/* can use left extent's crc entry */
- } else if (lp.crc.live_size <= rp.crc.offset ) {
+ } else if (lp.crc.live_size <= rp.crc.offset) {
/* can use right extent's crc entry */
} else {
/* check if checksums can be merged: */
@@ -305,8 +316,20 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
lp.crc.uncompressed_size +
rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
return false;
+ }
- if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
bch2_crc_field_size_max[extent_entry_type(en_l)])
return false;
}
@@ -334,7 +357,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
crc_l.uncompressed_size) {
/* can use left extent's crc entry */
- } else if (crc_l.live_size <= crc_r.offset ) {
+ } else if (crc_l.live_size <= crc_r.offset) {
/* can use right extent's crc entry */
crc_r.offset -= crc_l.live_size;
bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
@@ -363,17 +386,24 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(*r.v));
+ return -BCH_ERR_invalid_bkey;
+ }
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
- return "invalid nr_replicas";
+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
+ prt_printf(err, "invalid nr_replicas (%u)",
+ r.v->nr_replicas);
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -381,7 +411,7 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
- pr_buf(out, "generation %u replicas %u",
+ prt_printf(out, "generation %u replicas %u",
le32_to_cpu(r.v->generation),
r.v->nr_replicas);
}
@@ -480,7 +510,7 @@ restart_narrow_pointers:
bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
- __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
bch2_extent_ptr_decoded_append(k, &p);
@@ -633,22 +663,21 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
return replicas;
}
-static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
- struct extent_ptr_decoded p)
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
unsigned durability = 0;
struct bch_dev *ca;
- if (p.ptr.cached)
+ if (p->ptr.cached)
return 0;
- ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ ca = bch_dev_bkey_exists(c, p->ptr.dev);
if (ca->mi.state != BCH_MEMBER_STATE_failed)
durability = max_t(unsigned, durability, ca->mi.durability);
- if (p.has_ec)
- durability += p.ec.redundancy;
+ if (p->has_ec)
+ durability += p->ec.redundancy;
return durability;
}
@@ -661,40 +690,23 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
unsigned durability = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- durability += bch2_extent_ptr_durability(c, p);
+ durability += bch2_extent_ptr_durability(c, &p);
return durability;
}
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
- unsigned target,
- unsigned nr_desired_replicas)
+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *entry;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
- if (target && extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
-
- if (n && n <= extra &&
- !bch2_dev_in_target(c, p.ptr.dev, target)) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
+ unsigned durability = 0;
- if (extra > 0)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- int n = bch2_extent_ptr_durability(c, p);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
+ durability += bch2_extent_ptr_durability(c, &p);
- if (n && n <= extra) {
- entry->ptr.cached = true;
- extra -= n;
- }
- }
+ return durability;
}
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
@@ -706,41 +718,6 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry
k->k.u64s -= extent_entry_u64s(entry);
}
-void bch2_bkey_append_ptr(struct bkey_i *k,
- struct bch_extent_ptr ptr)
-{
- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
-
- switch (k->k.type) {
- case KEY_TYPE_btree_ptr:
- case KEY_TYPE_btree_ptr_v2:
- case KEY_TYPE_extent:
- EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
- ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-
- memcpy((void *) &k->v + bkey_val_bytes(&k->k),
- &ptr,
- sizeof(ptr));
- k->u64s++;
- break;
- default:
- BUG();
- }
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
- union bch_extent_entry *dst,
- union bch_extent_entry *new)
-{
- union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
- memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
- dst, (u64 *) end - (u64 *) dst);
- k->k.u64s += extent_entry_u64s(new);
- memcpy(dst, new, extent_entry_bytes(new));
-}
-
void bch2_extent_ptr_decoded_append(struct bkey_i *k,
struct extent_ptr_decoded *p)
{
@@ -800,8 +777,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
/*
* Returns pointer to the next entry after the one being dropped:
*/
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
@@ -844,7 +821,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
{
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
union bch_extent_entry *ret =
- __bch2_bkey_drop_ptr(k, ptr);
+ bch2_bkey_drop_ptr_noerror(k, ptr);
/*
* If we deleted all the dirty pointers and there's still cached
@@ -873,8 +850,15 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
-const struct bch_extent_ptr *
-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
+
+ if (ptr)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+}
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
@@ -918,6 +902,78 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
}
/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+ if (k1.k->type != k2.k->type)
+ return false;
+
+ if (bkey_extent_is_direct_data(k1.k)) {
+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry1, *entry2;
+ struct extent_ptr_decoded p1, p2;
+
+ if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
+ return false;
+
+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+ } else {
+ /* KEY_TYPE_deleted, etc. */
+ return true;
+ }
+}
+
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
+{
+ struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+ union bch_extent_entry *entry2;
+ struct extent_ptr_decoded p2;
+
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return &entry2->ptr;
+
+ return NULL;
+}
+
+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ union bch_extent_entry *ec = NULL;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (&entry->ptr == ptr) {
+ ptr->cached = true;
+ if (ec)
+ extent_entry_drop(k, ec);
+ return;
+ }
+
+ if (extent_entry_is_stripe_ptr(entry))
+ ec = entry;
+ else if (extent_entry_is_ptr(entry))
+ ec = NULL;
+ }
+
+ BUG();
+}
+
+/*
* bch_extent_normalize - clean up an extent, dropping stale pointers etc.
*
* Returns true if @k should be dropped entirely
@@ -947,29 +1003,44 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca;
bool first = true;
+ if (c)
+ prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
+
bkey_extent_entry_for_each(ptrs, entry) {
if (!first)
- pr_buf(out, " ");
+ prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
- pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
- (u64) ptr->offset, ptr->gen,
- ptr->cached ? " cached" : "",
- ca && ptr_stale(ca, ptr)
- ? " stale" : "");
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (ca && ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
@@ -979,11 +1050,11 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
- pr_buf(out, "ec: idx %llu block %u",
+ prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
default:
- pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
}
@@ -991,69 +1062,108 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-static const char *extent_ptr_invalid(const struct bch_fs *c,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata)
+static int extent_ptr_invalid(const struct bch_fs *c,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata,
+ struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr2;
+ u64 bucket;
+ u32 bucket_offset;
struct bch_dev *ca;
- if (!bch2_dev_exists2(c, ptr->dev))
- return "pointer to invalid device";
+ if (!bch2_dev_exists2(c, ptr->dev)) {
+ prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
+ return -BCH_ERR_invalid_bkey;
+ }
ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!ca)
- return "pointer to invalid device";
-
bkey_for_each_ptr(ptrs, ptr2)
- if (ptr != ptr2 && ptr->dev == ptr2->dev)
- return "multiple pointers to same device";
+ if (ptr != ptr2 && ptr->dev == ptr2->dev) {
+ prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
- return "offset past end of device";
+ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
- return "offset before first bucket";
+ if (bucket >= ca->mi.nbuckets) {
+ prt_printf(err, "pointer past last bucket (%llu > %llu)",
+ bucket, ca->mi.nbuckets);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bucket_remainder(ca, ptr->offset) +
- size_ondisk > ca->mi.bucket_size)
- return "spans multiple buckets";
+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
+ prt_printf(err, "pointer before first bucket (%llu < %u)",
+ bucket, ca->mi.first_bucket);
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
+ prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+ bucket_offset, size_ondisk, ca->mi.bucket_size);
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
}
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_devs_list devs;
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
unsigned size_ondisk = k.k->size;
- const char *reason;
unsigned nonce = UINT_MAX;
- unsigned i;
+ unsigned nr_ptrs = 0;
+ bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
+ int ret;
- if (k.k->type == KEY_TYPE_btree_ptr ||
- k.k->type == KEY_TYPE_btree_ptr_v2)
+ if (bkey_is_btree_ptr(k.k))
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
- return "invalid extent entry type";
+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
+ prt_printf(err, "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (k.k->type == KEY_TYPE_btree_ptr &&
- !extent_entry_is_ptr(entry))
- return "has non ptr field";
+ if (bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry)) {
+ prt_printf(err, "has non ptr field");
+ return -BCH_ERR_invalid_bkey;
+ }
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
- reason = extent_ptr_invalid(c, k, &entry->ptr,
- size_ondisk, false);
- if (reason)
- return reason;
+ ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk,
+ false, err);
+ if (ret)
+ return ret;
+
+ if (nr_ptrs && unwritten != entry->ptr.unwritten) {
+ prt_printf(err, "extent with unwritten and written ptrs");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
+ prt_printf(err, "has unwritten ptrs");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (entry->ptr.cached && have_ec) {
+ prt_printf(err, "cached, erasure coded ptr");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ unwritten = entry->ptr.unwritten;
+ have_ec = false;
+ crc_since_last_ptr = false;
+ nr_ptrs++;
break;
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
@@ -1061,36 +1171,69 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
if (crc.offset + crc.live_size >
- crc.uncompressed_size)
- return "checksum offset + key size > uncompressed size";
+ crc.uncompressed_size) {
+ prt_printf(err, "checksum offset + key size > uncompressed size");
+ return -BCH_ERR_invalid_bkey;
+ }
size_ondisk = crc.compressed_size;
- if (!bch2_checksum_type_valid(c, crc.csum_type))
- return "invalid checksum type";
+ if (!bch2_checksum_type_valid(c, crc.csum_type)) {
+ prt_printf(err, "invalid checksum type");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR)
- return "invalid compression type";
+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
+ prt_printf(err, "invalid compression type");
+ return -BCH_ERR_invalid_bkey;
+ }
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce)
- return "incorrect nonce";
+ else if (nonce != crc.offset + crc.nonce) {
+ prt_printf(err, "incorrect nonce");
+ return -BCH_ERR_invalid_bkey;
+ }
}
+
+ if (crc_since_last_ptr) {
+ prt_printf(err, "redundant crc entry");
+ return -BCH_ERR_invalid_bkey;
+ }
+ crc_since_last_ptr = true;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
+ if (have_ec) {
+ prt_printf(err, "redundant stripe entry");
+ return -BCH_ERR_invalid_bkey;
+ }
+ have_ec = true;
break;
}
}
- devs = bch2_bkey_devs(k);
- bubble_sort(devs.devs, devs.nr, u8_cmp);
- for (i = 0; i + 1 < devs.nr; i++)
- if (devs.devs[i] == devs.devs[i + 1])
- return "multiple ptrs to same device";
+ if (!nr_ptrs) {
+ prt_str(err, "no ptrs");
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
+ prt_str(err, "too many ptrs");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (crc_since_last_ptr) {
+ prt_printf(err, "redundant crc entry");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (have_ec) {
+ prt_printf(err, "redundant stripe entry");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
}
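
The ->key_invalid hooks now return an int (a -BCH_ERR_* code) and append their diagnostics to a caller-supplied printbuf instead of returning a static string. A minimal sketch of the new calling convention follows; the PRINTBUF initializer, printbuf_exit(), bch_err() and the 0 flags value are assumptions drawn from the surrounding bcachefs helpers, not part of this patch, and c/k stand for whatever filesystem and key the caller holds:

	struct printbuf buf = PRINTBUF;
	int ret = bch2_bkey_ptrs_invalid(c, k, 0, &buf);

	if (ret)				/* e.g. -BCH_ERR_invalid_bkey */
		bch_err(c, "invalid bkey: %s", buf.buf);
	printbuf_exit(&buf);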
void bch2_ptr_swab(struct bkey_s k)
@@ -1137,10 +1280,10 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
int val_u64s_delta;
u64 sub;
- if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+ if (bkey_le(where, bkey_start_pos(k.k)))
return 0;
- EBUG_ON(bkey_cmp(where, k.k->p) > 0);
+ EBUG_ON(bkey_gt(where, k.k->p));
sub = where.offset - bkey_start_offset(k.k);
@@ -1217,10 +1360,10 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
int val_u64s_delta;
u64 len = 0;
- if (bkey_cmp(where, k.k->p) >= 0)
+ if (bkey_ge(where, k.k->p))
return 0;
- EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0);
+ EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
len = where.offset - bkey_start_offset(k.k);
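
bch2_cut_front_s() and bch2_cut_back_s() above, like the rest of this patch, replace open-coded bkey_cmp() comparisons with named helpers; the substitutions shown in these hunks imply the following equivalences:

	/* Implied by the conversions in this file: */
	bkey_lt(a, b)	/* bkey_cmp(a, b) <  0 */
	bkey_le(a, b)	/* bkey_cmp(a, b) <= 0 */
	bkey_gt(a, b)	/* bkey_cmp(a, b) >  0 */
	bkey_ge(a, b)	/* bkey_cmp(a, b) >= 0 */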
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 9c2567274a2b..9b026ae95932 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -76,6 +76,18 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
return extent_entry_bytes(entry) / sizeof(u64);
}
+static inline void __extent_entry_insert(struct bkey_i *k,
+ union bch_extent_entry *dst,
+ union bch_extent_entry *new)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+
+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+ dst, (u64 *) end - (u64 *) dst);
+ k->k.u64s += extent_entry_u64s(new);
+ memcpy_u64s_small(dst, new, extent_entry_u64s(new));
+}
+
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@@ -198,6 +210,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
switch (k.k->type) {
case KEY_TYPE_btree_ptr: {
struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+
return (struct bkey_ptrs_c) {
to_entry(&e.v->start[0]),
to_entry(extent_entry_last(e))
@@ -205,6 +218,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
}
case KEY_TYPE_extent: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
return (struct bkey_ptrs_c) {
e.v->start,
extent_entry_last(e)
@@ -212,6 +226,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
}
case KEY_TYPE_stripe: {
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
return (struct bkey_ptrs_c) {
to_entry(&s.v->ptrs[0]),
to_entry(&s.v->ptrs[s.v->nr_blocks]),
@@ -227,6 +242,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
}
case KEY_TYPE_btree_ptr_v2: {
struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
+
return (struct bkey_ptrs_c) {
to_entry(&e.v->start[0]),
to_entry(extent_entry_last(e))
@@ -342,7 +358,7 @@ out: \
#define extent_for_each_entry_from(_e, _entry, _start) \
__bkey_extent_entry_for_each_from(_start, \
- extent_entry_last(_e),_entry)
+ extent_entry_last(_e), _entry)
#define extent_for_each_entry(_e, _entry) \
extent_for_each_entry_from(_e, _entry, (_e).v->start)
@@ -367,54 +383,60 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
-const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
int, struct bkey_s);
-#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \
+#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_invalid, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
-}
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+})
-#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
+#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_v2_invalid, \
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
-}
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+})
/* KEY_TYPE_extent: */
-const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-#define bch2_bkey_ops_extent (struct bkey_ops) { \
- .key_invalid = bch2_extent_invalid, \
- .val_to_text = bch2_extent_to_text, \
+#define bch2_bkey_ops_extent ((struct bkey_ops) { \
+ .key_invalid = bch2_bkey_ptrs_invalid, \
+ .val_to_text = bch2_bkey_ptrs_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
-}
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+})
/* KEY_TYPE_reservation: */
-const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-#define bch2_bkey_ops_reservation (struct bkey_ops) { \
+#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
.key_invalid = bch2_reservation_invalid, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
-}
+ .trans_trigger = bch2_trans_mark_reservation, \
+ .atomic_trigger = bch2_mark_reservation, \
+})
/* Extent checksum entries: */
@@ -500,6 +522,23 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
}
}
+static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->unwritten)
+ return true;
+ return false;
+}
+
+static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
+{
+ return k.k->type == KEY_TYPE_reservation ||
+ bkey_extent_is_unwritten(k);
+}
+
static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
@@ -569,17 +608,49 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
+unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
- unsigned, unsigned);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
+
+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
+{
+ return (void *) bch2_bkey_has_device_c(k.s_c, dev);
+}
+
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
-void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+
+static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
+{
+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
+
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ case KEY_TYPE_extent:
+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+
+ memcpy((void *) &k->v + bkey_val_bytes(&k->k),
+ &ptr,
+ sizeof(ptr));
+ k->k.u64s++;
+ break;
+ default:
+ BUG();
+ }
+}
+
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
- struct bch_extent_ptr *);
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
+ struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
@@ -600,17 +671,19 @@ do { \
} \
} while (0)
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
+
+void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
void bch2_ptr_swab(struct bkey_s);
@@ -627,9 +700,8 @@ enum bch_extent_overlap {
static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
const struct bkey *m)
{
- int cmp1 = bkey_cmp(k->p, m->p) < 0;
- int cmp2 = bkey_cmp(bkey_start_pos(k),
- bkey_start_pos(m)) > 0;
+ int cmp1 = bkey_lt(k->p, m->p);
+ int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
return (cmp1 << 1) + cmp2;
}
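
For reference, the two booleans above form a two-bit code describing how k overlaps m: cmp1 says whether k ends before m does, cmp2 whether k starts after m does. Decoding the return expression (the enum bch_extent_overlap names themselves are not shown in this hunk):

	/*
	 * (cmp1 << 1) + cmp2 from bch2_extent_overlap() above:
	 *   0: k starts at/before m's start and ends at/after m's end (covers m)
	 *   1: k starts after m's start and ends at/after m's end (back of m)
	 *   2: k starts at/before m's start and ends before m's end (front of m)
	 *   3: k starts after m's start and ends before m's end (middle of m)
	 */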
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index cdb272708a4b..66b945be10c2 100644
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
@@ -65,7 +65,7 @@ do { \
(((p) - (fifo)->data)))
#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
-#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
+#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
#define fifo_push_back_ref(f) \
(fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index d543480be111..1f2e1fc4f6b2 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -204,12 +204,19 @@ int bch2_link_trans(struct btree_trans *trans,
goto err;
inode_u->bi_ctime = now;
- bch2_inode_nlink_inc(inode_u);
+ ret = bch2_inode_nlink_inc(inode_u);
+ if (ret)
+ return ret;
ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
+ if (bch2_reinherit_attrs(inode_u, dir_u)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
dir_u->bi_mtime = dir_u->bi_ctime = now;
dir_hash = bch2_hash_info_init(c, dir_u);
@@ -297,7 +304,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
} else {
- bch2_inode_nlink_dec(inode_u);
+ bch2_inode_nlink_dec(trans, inode_u);
}
if (inode_u->bi_dir == dirent_iter.pos.inode &&
@@ -462,7 +469,7 @@ int bch2_rename_trans(struct btree_trans *trans,
}
if (mode == BCH_RENAME_OVERWRITE)
- bch2_inode_nlink_dec(dst_inode_u);
+ bch2_inode_nlink_dec(trans, dst_inode_u);
src_dir_u->bi_mtime = now;
src_dir_u->bi_ctime = now;
@@ -480,11 +487,11 @@ int bch2_rename_trans(struct btree_trans *trans,
ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
(src_dir.inum != dst_dir.inum
? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
- : 0 ) ?:
+ : 0) ?:
bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
(dst_inum.inum
? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
- : 0 );
+ : 0);
err:
bch2_trans_iter_exit(trans, &dst_inode_iter);
bch2_trans_iter_exit(trans, &src_inode_iter);
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8f0b2a745064..df2f317f5443 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -35,6 +35,81 @@
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
+struct nocow_flush {
+ struct closure *cl;
+ struct bch_dev *ca;
+ struct bio bio;
+};
+
+static void nocow_flush_endio(struct bio *_bio)
+{
+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
+
+ closure_put(bio->cl);
+ percpu_ref_put(&bio->ca->io_ref);
+ bio_put(&bio->bio);
+}
+
+static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct closure *cl)
+{
+ struct nocow_flush *bio;
+ struct bch_dev *ca;
+ struct bch_devs_mask devs;
+ unsigned dev;
+
+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
+ if (dev == BCH_SB_MEMBERS_MAX)
+ return;
+
+ devs = inode->ei_devs_need_flush;
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
+
+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
+
+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+ REQ_OP_FLUSH,
+ GFP_KERNEL,
+ &c->nocow_flush_bioset),
+ struct nocow_flush, bio);
+ bio->cl = cl;
+ bio->ca = ca;
+ bio->bio.bi_end_io = nocow_flush_endio;
+ closure_bio_submit(&bio->bio, cl);
+ }
+}
+
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+ closure_sync(&cl);
+
+ return 0;
+}
+
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
+}
+
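
The nocow-flush helpers above give fsync-style paths a way to flush devices that received nocow writes: writers record their target devices in inode->ei_devs_need_flush (the write ops later in this patch point op->devs_need_flush at it), and the flush consumes that mask and submits an empty REQ_OP_FLUSH bio per device under a closure. The blocking wrapper shows the usage pattern; an asynchronous caller drives the same helper against its own closure, as bch2_dio_write_flush() does further down:

	/* Usage pattern, mirroring the wrapper above and bch2_dio_write_flush(): */
	struct closure cl;

	closure_init_stack(&cl);
	bch2_inode_flush_nocow_writes_async(c, inode, &cl);
	/* ...queue any other flushes (e.g. a journal flush) against &cl... */
	closure_sync(&cl);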
static inline struct address_space *faults_disabled_mapping(void)
{
return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
@@ -56,7 +131,6 @@ struct quota_res {
};
struct bch_writepage_io {
- struct closure cl;
struct bch_inode_info *inode;
/* must be last: */
@@ -64,11 +138,14 @@ struct bch_writepage_io {
};
struct dio_write {
- struct completion done;
struct kiocb *req;
+ struct address_space *mapping;
+ struct bch_inode_info *inode;
struct mm_struct *mm;
unsigned loop:1,
+ extending:1,
sync:1,
+ flush:1,
free_iov:1;
struct quota_res quota_res;
u64 written;
@@ -89,7 +166,7 @@ struct dio_read {
};
/* pagecache_block must be held */
-static int write_invalidate_inode_pages_range(struct address_space *mapping,
+static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
loff_t start, loff_t end)
{
int ret;
@@ -121,28 +198,33 @@ static int write_invalidate_inode_pages_range(struct address_space *mapping,
#ifdef CONFIG_BCACHEFS_QUOTA
-static void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
+static void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
{
- if (!res->sectors)
- return;
-
- mutex_lock(&inode->ei_quota_lock);
BUG_ON(res->sectors > inode->ei_quota_reserved);
bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
inode->ei_quota_reserved -= res->sectors;
- mutex_unlock(&inode->ei_quota_lock);
-
res->sectors = 0;
}
+static void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ if (res->sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_quota_reservation_put(c, inode, res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
static int bch2_quota_reservation_add(struct bch_fs *c,
struct bch_inode_info *inode,
struct quota_res *res,
- unsigned sectors,
+ u64 sectors,
bool check_enospc)
{
int ret;
@@ -161,11 +243,13 @@ static int bch2_quota_reservation_add(struct bch_fs *c,
#else
+static void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
+
static void bch2_quota_reservation_put(struct bch_fs *c,
struct bch_inode_info *inode,
- struct quota_res *res)
-{
-}
+ struct quota_res *res) {}
static int bch2_quota_reservation_add(struct bch_fs *c,
struct bch_inode_info *inode,
@@ -216,14 +300,13 @@ int __must_check bch2_write_inode_size(struct bch_fs *c,
return bch2_write_inode(c, inode, inode_set_size, &s, fields);
}
-static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, s64 sectors)
{
- if (!sectors)
- return;
-
- mutex_lock(&inode->ei_quota_lock);
- BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+ inode->ei_inode.bi_sectors);
inode->v.i_blocks += sectors;
#ifdef CONFIG_BCACHEFS_QUOTA
@@ -237,7 +320,16 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
}
#endif
- mutex_unlock(&inode->ei_quota_lock);
+}
+
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ if (sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __i_sectors_acct(c, inode, quota_res, sectors);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
}
/* page state: */
@@ -285,28 +377,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page)
/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = __bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ kfree(detach_page_private(page));
}
static void bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ EBUG_ON(!PageLocked(page));
+ __bch2_page_state_release(page);
}
/* for newly allocated pages: */
@@ -320,13 +397,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
return NULL;
spin_lock_init(&s->lock);
- /*
- * migrate_page_move_mapping() assumes that pages with private data
- * have their count elevated by 1.
- */
- get_page(page);
- set_page_private(page, (unsigned long) s);
- SetPagePrivate(page);
+ attach_page_private(page, s);
return s;
}
@@ -336,11 +407,11 @@ static struct bch_page_state *bch2_page_state_create(struct page *page,
return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
}
-static unsigned bkey_to_sector_state(const struct bkey *k)
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
{
- if (k->type == KEY_TYPE_reservation)
+ if (bkey_extent_is_reservation(k))
return SECTOR_RESERVED;
- if (bkey_extent_is_allocation(k))
+ if (bkey_extent_is_allocation(k.k))
return SECTOR_ALLOCATED;
return SECTOR_UNALLOCATED;
}
@@ -391,7 +462,7 @@ retry:
SPOS(inum.inum, offset, snapshot),
BTREE_ITER_SLOTS, k, ret) {
unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k.k);
+ unsigned state = bkey_to_sector_state(k);
while (pg_idx < nr_pages) {
struct page *page = pages[pg_idx];
@@ -418,7 +489,7 @@ retry:
offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
@@ -431,7 +502,7 @@ static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
struct bio_vec bv;
unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k.k);
+ unsigned state = bkey_to_sector_state(k);
bio_for_each_segment(bv, bio, iter)
__bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9,
@@ -443,22 +514,20 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode,
{
pgoff_t index = start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct pagevec pvec;
+ struct folio_batch fbatch;
+ unsigned i, j;
if (end <= start)
return;
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i, j;
+ folio_batch_init(&fbatch);
- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
- &index, end_index);
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
- u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
unsigned pg_offset = max(start, pg_start) - pg_start;
unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
struct bch_page_state *s;
@@ -467,8 +536,8 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode,
BUG_ON(pg_offset >= PAGE_SECTORS);
BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
- lock_page(page);
- s = bch2_page_state(page);
+ folio_lock(folio);
+ s = bch2_page_state(&folio->page);
if (s) {
spin_lock(&s->lock);
@@ -477,10 +546,11 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode,
spin_unlock(&s->lock);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
- } while (index <= end_index);
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
}
static void mark_pagecache_reserved(struct bch_inode_info *inode,
@@ -489,23 +559,21 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
pgoff_t index = start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct pagevec pvec;
+ struct folio_batch fbatch;
s64 i_sectors_delta = 0;
+ unsigned i, j;
if (end <= start)
return;
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i, j;
+ folio_batch_init(&fbatch);
- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
- &index, end_index);
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
- u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 pg_start = folio->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT;
unsigned pg_offset = max(start, pg_start) - pg_start;
unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
struct bch_page_state *s;
@@ -514,8 +582,8 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode,
BUG_ON(pg_offset >= PAGE_SECTORS);
BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
- lock_page(page);
- s = bch2_page_state(page);
+ folio_lock(folio);
+ s = bch2_page_state(&folio->page);
if (s) {
spin_lock(&s->lock);
@@ -534,10 +602,11 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode,
spin_unlock(&s->lock);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
- } while (index <= end_index);
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
@@ -617,7 +686,7 @@ static void bch2_page_reservation_put(struct bch_fs *c,
static int bch2_page_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode, struct page *page,
struct bch2_page_reservation *res,
- unsigned offset, unsigned len, bool check_enospc)
+ unsigned offset, unsigned len)
{
struct bch_page_state *s = bch2_page_state_create(page, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
@@ -637,19 +706,14 @@ static int bch2_page_reservation_get(struct bch_fs *c,
}
if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &res->disk,
- disk_sectors,
- !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL
- : 0);
+ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
if (unlikely(ret))
return ret;
}
if (quota_sectors) {
ret = bch2_quota_reservation_add(c, inode, &res->quota,
- quota_sectors,
- check_enospc);
+ quota_sectors, true);
if (unlikely(ret)) {
struct disk_reservation tmp = {
.sectors = disk_sectors
@@ -748,7 +812,7 @@ static void bch2_set_page_dirty(struct bch_fs *c,
i_sectors_acct(c, inode, &res->quota, dirty_sectors);
if (!PageDirty(page))
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(inode->v.i_mapping, page_folio(page));
}
vm_fault_t bch2_page_fault(struct vm_fault *vmf)
@@ -766,25 +830,25 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf)
if (fdm > mapping) {
struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
- if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+ if (bch2_pagecache_add_tryget(inode))
goto got_lock;
- bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+ bch2_pagecache_block_put(fdm_host);
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
+ bch2_pagecache_add_put(inode);
- bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+ bch2_pagecache_block_get(fdm_host);
/* Signal that lock has been dropped: */
set_fdm_dropped_locks();
return VM_FAULT_SIGBUS;
}
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
got_lock:
ret = filemap_fault(vmf);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
return ret;
}
@@ -812,7 +876,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
* a write_invalidate_inode_pages_range() that works without dropping
* page lock before invalidating page
*/
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
lock_page(page);
isize = i_size_read(&inode->v);
@@ -833,7 +897,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
}
}
- if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
goto out;
@@ -845,68 +909,37 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
wait_for_stable_page(page);
ret = VM_FAULT_LOCKED;
out:
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
sb_end_pagefault(inode->v.i_sb);
return ret;
}
-void bch2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- if (offset || length < PAGE_SIZE)
+ if (offset || length < folio_size(folio))
return;
- bch2_clear_page_bits(page);
+ bch2_clear_page_bits(&folio->page);
}
-int bch2_releasepage(struct page *page, gfp_t gfp_mask)
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
- if (PageDirty(page))
- return 0;
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ return false;
- bch2_clear_page_bits(page);
- return 1;
+ bch2_clear_page_bits(&folio->page);
+ return true;
}
-#ifdef CONFIG_MIGRATION
-int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- int ret;
-
- EBUG_ON(!PageLocked(page));
- EBUG_ON(!PageLocked(newpage));
-
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
- if (ret != MIGRATEPAGE_SUCCESS)
- return ret;
-
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
- return MIGRATEPAGE_SUCCESS;
-}
-#endif
-
/* readpage(s): */
static void bch2_readpages_end_io(struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i) {
+ bio_for_each_segment_all(bv, bio, iter) {
struct page *page = bv->bv_page;
if (!bio->bi_status) {
@@ -925,31 +958,29 @@ struct readpages_iter {
struct address_space *mapping;
struct page **pages;
unsigned nr_pages;
- unsigned nr_added;
unsigned idx;
pgoff_t offset;
};
static int readpages_iter_init(struct readpages_iter *iter,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct readahead_control *ractl)
{
+ unsigned i, nr_pages = readahead_count(ractl);
+
memset(iter, 0, sizeof(*iter));
- iter->mapping = mapping;
- iter->offset = list_last_entry(pages, struct page, lru)->index;
+ iter->mapping = ractl->mapping;
+ iter->offset = readahead_index(ractl);
+ iter->nr_pages = nr_pages;
iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!iter->pages)
return -ENOMEM;
- while (!list_empty(pages)) {
- struct page *page = list_last_entry(pages, struct page, lru);
-
- __bch2_page_state_create(page, __GFP_NOFAIL);
-
- iter->pages[iter->nr_pages++] = page;
- list_del(&page->lru);
+ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+ put_page(iter->pages[i]);
}
return 0;
@@ -957,41 +988,9 @@ static int readpages_iter_init(struct readpages_iter *iter,
static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
- struct page *page;
- unsigned i;
- int ret;
-
- BUG_ON(iter->idx > iter->nr_added);
- BUG_ON(iter->nr_added > iter->nr_pages);
-
- if (iter->idx < iter->nr_added)
- goto out;
-
- while (1) {
- if (iter->idx == iter->nr_pages)
- return NULL;
-
- ret = add_to_page_cache_lru_vec(iter->mapping,
- iter->pages + iter->nr_added,
- iter->nr_pages - iter->nr_added,
- iter->offset + iter->nr_added,
- GFP_NOFS);
- if (ret > 0)
- break;
-
- page = iter->pages[iter->nr_added];
- iter->idx++;
- iter->nr_added++;
-
- __bch2_page_state_release(page);
- put_page(page);
- }
-
- iter->nr_added += ret;
+ if (iter->idx >= iter->nr_pages)
+ return NULL;
- for (i = iter->idx; i < iter->nr_added; i++)
- put_page(iter->pages[i]);
-out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
return iter->pages[iter->idx];
@@ -1029,11 +1028,8 @@ static void readpage_bio_extend(struct readpages_iter *iter,
if (!get_more)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
- rcu_read_unlock();
-
- if (page && !radix_tree_exceptional_entry(page))
+ page = xa_load(&iter->mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page))
break;
page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
@@ -1098,10 +1094,9 @@ retry:
* read_extent -> io_time_reset may cause a transaction restart
* without returning an error, we need to check for that here:
*/
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
+ ret = bch2_trans_relock(trans);
+ if (ret)
break;
- }
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@ -1126,8 +1121,6 @@ retry:
sectors = min(sectors, k.k->size - offset_into_extent);
- bch2_trans_unlock(trans);
-
if (readpages_iter)
readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
@@ -1156,11 +1149,13 @@ retry:
err:
bch2_trans_iter_exit(trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (ret) {
- bch_err_inum_ratelimited(c, inum.inum,
+ bch_err_inum_offset_ratelimited(c,
+ iter.pos.inode,
+ iter.pos.offset << 9,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
@@ -1169,37 +1164,38 @@ err:
bch2_bkey_buf_exit(&sk, c);
}
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+ struct bch_io_opts opts;
struct btree_trans trans;
struct page *page;
struct readpages_iter readpages_iter;
int ret;
- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ ret = readpages_iter_init(&readpages_iter, ractl);
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
while ((page = readpage_iter_next(&readpages_iter))) {
pgoff_t index = readpages_iter.offset + readpages_iter.idx;
unsigned n = min_t(unsigned,
readpages_iter.nr_pages -
readpages_iter.idx,
- BIO_MAX_PAGES);
+ BIO_MAX_VECS);
struct bch_read_bio *rbio =
- rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+ GFP_NOFS, &c->bio_read),
opts);
readpages_iter.idx++;
- bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
@@ -1208,12 +1204,10 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
&readpages_iter);
}
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
-
- return 0;
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
@@ -1223,7 +1217,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
bch2_page_state_create(page, __GFP_NOFAIL);
- bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
rbio->bio.bi_iter.bi_sector =
(sector_t) page->index << PAGE_SECTORS_SHIFT;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
@@ -1233,20 +1227,6 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
bch2_trans_exit(&trans);
}
-int bch2_readpage(struct file *file, struct page *page)
-{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
- struct bch_read_bio *rbio;
-
- rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
- rbio->bio.bi_end_io = bch2_readpages_end_io;
-
- __bchfs_readpage(c, rbio, inode_inum(inode), page);
- return 0;
-}
-
static void bch2_read_single_page_end_io(struct bio *bio)
{
complete(bio->bi_private);
@@ -1258,11 +1238,14 @@ static int bch2_read_single_page(struct page *page,
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_read_bio *rbio;
+ struct bch_io_opts opts;
int ret;
DECLARE_COMPLETION_ONSTACK(done);
- rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
- io_opts(c, &inode->ei_inode));
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read),
+ opts);
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
@@ -1279,6 +1262,16 @@ static int bch2_read_single_page(struct page *page,
return 0;
}
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ int ret;
+
+ ret = bch2_read_single_page(page, page->mapping);
+ folio_unlock(folio);
+ return bch2_err_class(ret);
+}
+
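
Two error-handling conventions recur throughout this patch: transaction restarts are detected with bch2_err_matches(ret, BCH_ERR_transaction_restart) instead of comparing against -EINTR (see the retry loops above), and private -BCH_ERR_* codes are folded back to generic errnos with bch2_err_class() at the VFS boundary, as bch2_read_folio() does here. Schematically:

	/* Recurring pattern in this patch (illustrative only): */
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;		/* restarts never escape to the caller */

	/* and at the filesystem <-> VFS boundary: */
	return bch2_err_class(ret);	/* map -BCH_ERR_* codes to plain errnos */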
/* writepages: */
struct bch_writepage_state {
@@ -1289,55 +1282,47 @@ struct bch_writepage_state {
static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
struct bch_inode_info *inode)
{
- return (struct bch_writepage_state) {
- .opts = io_opts(c, &inode->ei_inode)
- };
-}
+ struct bch_writepage_state ret = { 0 };
-static void bch2_writepage_io_free(struct closure *cl)
-{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
-
- bio_put(&io->op.wbio.bio);
+ bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
+ return ret;
}
-static void bch2_writepage_io_done(struct closure *cl)
+static void bch2_writepage_io_done(struct bch_write_op *op)
{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
+ struct bch_writepage_io *io =
+ container_of(op, struct bch_writepage_io, op);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i, j;
-
- up(&io->op.c->io_in_flight);
+ unsigned i;
if (io->op.error) {
set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
SetPageError(bvec->bv_page);
- mapping_set_error(io->inode->v.i_mapping, -EIO);
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
s = __bch2_page_state(bvec->bv_page);
spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
+ for (i = 0; i < PAGE_SECTORS; i++)
+ s->s[i].nr_replicas = 0;
spin_unlock(&s->lock);
}
}
@@ -1346,7 +1331,7 @@ static void bch2_writepage_io_done(struct closure *cl)
* racing with fallocate can cause us to add fewer sectors than
* expected - but we shouldn't add more sectors than expected:
*/
- WARN_ON(io->op.i_sectors_delta > 0);
+ WARN_ON_ONCE(io->op.i_sectors_delta > 0);
/*
* (error (due to going RO) halfway through a page can screw that up
@@ -1361,25 +1346,22 @@ static void bch2_writepage_io_done(struct closure *cl)
*/
i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
if (atomic_dec_and_test(&s->write_count))
end_page_writeback(bvec->bv_page);
}
- closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
+ bio_put(&io->op.wbio.bio);
}
static void bch2_writepage_do_io(struct bch_writepage_state *w)
{
struct bch_writepage_io *io = w->io;
- down(&io->op.c->io_in_flight);
-
w->io = NULL;
- closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
- continue_at(&io->cl, bch2_writepage_io_done, NULL);
+ closure_call(&io->op.cl, bch2_write, NULL, NULL);
}
/*
@@ -1395,13 +1377,13 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
{
struct bch_write_op *op;
- w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES,
+ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+ REQ_OP_WRITE,
+ GFP_NOFS,
&c->writepage_bioset),
struct bch_writepage_io, op.wbio.bio);
- closure_init(&w->io->cl, NULL);
w->io->inode = inode;
-
op = &w->io->op;
bch2_write_op_init(op, c, w->opts);
op->target = w->opts.foreground_target;
@@ -1410,6 +1392,8 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
+ op->end_io = bch2_writepage_io_done;
+ op->devs_need_flush = &inode->ei_devs_need_flush;
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}
@@ -1515,9 +1499,9 @@ do_io:
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio) ||
+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
- (BIO_MAX_PAGES * PAGE_SIZE) ||
+ (BIO_MAX_VECS * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
@@ -1532,8 +1516,13 @@ do_io:
sectors << 9, offset << 9));
/* Check for writing past i_size: */
- WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)));
+ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+ "writing past i_size: %llu > %llu (unrounded %llu)\n",
+ bio_end_sector(&w->io->op.wbio.bio) << 9,
+ round_up(i_size, block_bytes(c)),
+ i_size);
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
@@ -1561,27 +1550,13 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
if (w.io)
bch2_writepage_do_io(&w);
blk_finish_plug(&plug);
- return ret;
-}
-
-int bch2_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
- int ret;
-
- ret = __bch2_writepage(page, wbc, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
-
- return ret;
+ return bch2_err_class(ret);
}
/* buffered writes: */
int bch2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
@@ -1599,9 +1574,9 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_page_reservation_init(c, inode, res);
*fsdata = res;
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
goto err_unlock;
@@ -1631,11 +1606,10 @@ out:
if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
if (ret)
- goto out;
+ goto err;
}
- ret = bch2_page_reservation_get(c, inode, page, res,
- offset, len, true);
+ ret = bch2_page_reservation_get(c, inode, page, res, offset, len);
if (ret) {
if (!PageUptodate(page)) {
/*
@@ -1657,10 +1631,10 @@ err:
put_page(page);
*pagep = NULL;
err_unlock:
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
kfree(res);
*fsdata = NULL;
- return ret;
+ return bch2_err_class(ret);
}
int bch2_write_end(struct file *file, struct address_space *mapping,
@@ -1701,7 +1675,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
bch2_page_reservation_put(c, inode, res);
kfree(res);
@@ -1732,7 +1706,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
bch2_page_reservation_init(c, inode, &res);
for (i = 0; i < nr_pages; i++) {
- pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
+ pages[i] = grab_cache_page_write_begin(mapping, index + i);
if (!pages[i]) {
nr_pages = i;
if (!i) {
@@ -1776,10 +1750,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
goto out;
}
+ /*
+ * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
+ * supposed to write as much as we have disk space for.
+ *
+ * On failure here we should still write out a partial page if
+ * we aren't completely out of disk space - we don't do that
+ * yet:
+ */
ret = bch2_page_reservation_get(c, inode, page, &res,
- pg_offset, pg_len, true);
- if (ret)
- goto out;
+ pg_offset, pg_len);
+ if (unlikely(ret)) {
+ if (!reserved)
+ goto out;
+ break;
+ }
reserved += pg_len;
}
@@ -1788,13 +1773,13 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
for (i = 0; i < nr_pages; i++)
flush_dcache_page(pages[i]);
- while (copied < len) {
+ while (copied < reserved) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
- unsigned pg_len = min_t(unsigned, len - copied,
+ unsigned pg_len = min_t(unsigned, reserved - copied,
PAGE_SIZE - pg_offset);
- unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
- iter, pg_offset, pg_len);
+ unsigned pg_copied = copy_page_from_iter_atomic(page,
+ pg_offset, pg_len, iter);
if (!pg_copied)
break;
@@ -1807,7 +1792,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
}
flush_dcache_page(page);
- iov_iter_advance(iter, pg_copied);
copied += pg_copied;
if (pg_copied != pg_len)
@@ -1860,7 +1844,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
ssize_t written = 0;
int ret = 0;
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
do {
unsigned offset = pos & (PAGE_SIZE - 1);
@@ -1877,11 +1861,11 @@ again:
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE - offset);
- if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
ret = -EFAULT;
break;
}
@@ -1918,25 +1902,13 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
return written ? written : ret;
}
/* O_DIRECT reads */
-static void bio_release_pages(struct bio *bio, bool mark_dirty)
-{
- struct bio_vec *bvec;
- unsigned i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- if (mark_dirty && !PageCompound(bvec->bv_page))
- set_page_dirty_lock(bvec->bv_page);
- put_page(bvec->bv_page);
- }
-}
-
static void bio_check_or_release(struct bio *bio, bool check_dirty)
{
if (check_dirty) {
@@ -1951,7 +1923,7 @@ static void bch2_dio_read_complete(struct closure *cl)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
- dio->req->ki_complete(dio->req, dio->ret, 0);
+ dio->req->ki_complete(dio->req, dio->ret);
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
@@ -1979,7 +1951,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct file *file = req->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+ struct bch_io_opts opts;
struct dio_read *dio;
struct bio *bio;
loff_t offset = req->ki_pos;
@@ -1987,6 +1959,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
size_t shorten;
ssize_t ret;
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
if ((offset|iter->count) & (block_bytes(c) - 1))
return -EINVAL;
@@ -1999,8 +1973,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
iter->count -= shorten;
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
&c->dio_read_bioset);
bio->bi_end_io = bch2_direct_IO_read_endio;
@@ -2034,12 +2010,14 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
goto start;
while (iter->count) {
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
&c->bio_read);
bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC;
bio->bi_iter.bi_sector = offset >> 9;
bio->bi_private = dio;
@@ -2089,11 +2067,13 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (iocb->ki_flags & IOCB_DIRECT) {
struct blk_plug plug;
- ret = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret < 0)
- return ret;
+ if (unlikely(mapping->nrpages)) {
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ goto out;
+ }
file_accessed(file);
@@ -2104,12 +2084,12 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret >= 0)
iocb->ki_pos += ret;
} else {
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
ret = generic_file_read_iter(iocb, iter);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
}
-
- return ret;
+out:
+ return bch2_err_class(ret);
}
/* O_DIRECT writes */
@@ -2137,7 +2117,7 @@ retry:
for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
SPOS(inum.inum, offset, snapshot),
BTREE_ITER_SLOTS, k, err) {
- if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
+ if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
break;
if (k.k->p.snapshot != snapshot ||
@@ -2151,39 +2131,172 @@ retry:
offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (err == -EINTR)
+ if (bch2_err_matches(err, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
return err ? false : ret;
}
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ return bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0);
+}
+
static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+ struct iovec *iov = dio->inline_vecs;
+
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+ GFP_KERNEL);
+ if (unlikely(!iov))
+ return -ENOMEM;
+
+ dio->free_iov = true;
+ }
-static long bch2_dio_write_loop(struct dio_write *dio)
+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
+ dio->iter.iov = iov;
+ return 0;
+}
+
+static void bch2_dio_write_flush_done(struct closure *cl)
+{
+ struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+ struct bch_fs *c = dio->op.c;
+
+ closure_debug_destroy(cl);
+
+ dio->op.error = bch2_journal_error(&c->journal);
+
+ bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ dio->flush = 0;
+
+ closure_init(&dio->op.cl, NULL);
+
+ if (!dio->op.error) {
+ ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+ if (ret) {
+ dio->op.error = ret;
+ } else {
+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
+ bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+ }
+ }
+
+ if (dio->sync) {
+ closure_sync(&dio->op.cl);
+ closure_debug_destroy(&dio->op.cl);
+ } else {
+ continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+ }
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
struct kiocb *req = dio->req;
- struct address_space *mapping = req->ki_filp->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_inode_info *inode = dio->inode;
+ bool sync = dio->sync;
+ long ret;
+
+ if (unlikely(dio->flush)) {
+ bch2_dio_write_flush(dio);
+ if (!sync)
+ return -EIOCBQUEUED;
+ }
+
+ bch2_pagecache_block_put(inode);
+
+ if (dio->free_iov)
+ kfree(dio->iter.iov);
+
+ ret = dio->op.error ?: ((long) dio->written << 9);
+ bio_put(&dio->op.wbio.bio);
+
+ /* inode->i_dio_count is our ref on inode and thus bch_fs */
+ inode_dio_end(&inode->v);
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+
+ if (!sync) {
+ req->ki_complete(req, ret);
+ ret = -EIOCBQUEUED;
+ }
+ return ret;
+}
+
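
With the completion-based loop gone, an O_DSYNC direct write no longer sets BCH_WRITE_FLUSH on the write op; it finishes in stages instead. The data write completes first, then bch2_dio_write_done() notices dio->flush and hands off to bch2_dio_write_flush(), which flushes the journal up to the inode's bi_journal_seq and any nocow devices before the kiocb is completed. Roughly:

	/*
	 * O_DSYNC flow after this patch:
	 *
	 *   bch2_dio_write_loop()    data write (BCH_WRITE_SYNC when blocking)
	 *   bch2_dio_write_done()    sees dio->flush set
	 *   bch2_dio_write_flush()   flushes journal to inode.bi_journal_seq plus
	 *                            any nocow devices; waits if sync, otherwise
	 *                            continues via bch2_dio_write_flush_done()
	 *                            back into bch2_dio_write_done(), which
	 *                            completes the kiocb
	 */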
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
struct bio *bio = &dio->op.wbio.bio;
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i, unaligned, iter_count;
+
+ req->ki_pos += (u64) dio->op.written << 9;
+ dio->written += dio->op.written;
+
+ if (dio->extending) {
+ spin_lock(&inode->v.i_lock);
+ if (req->ki_pos > inode->v.i_size)
+ i_size_write(&inode->v, req->ki_pos);
+ spin_unlock(&inode->v.i_lock);
+ }
+
+ if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+ __bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+
+ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
+ bio_for_each_segment_all(bv, bio, iter)
+ put_page(bv->bv_page);
+
+ if (unlikely(dio->op.error))
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
+static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct address_space *mapping = dio->mapping;
+ struct bch_inode_info *inode = dio->inode;
+ struct bch_io_opts opts;
+ struct bio *bio = &dio->op.wbio.bio;
+ unsigned unaligned, iter_count;
bool sync = dio->sync, dropped_locks;
long ret;
- if (dio->loop)
- goto loop;
-
- down(&c->io_in_flight);
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
while (1) {
iter_count = dio->iter.count;
- if (kthread)
- use_mm(dio->mm);
- BUG_ON(current->faults_disabled_mapping);
+ EBUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
@@ -2191,8 +2304,6 @@ static long bch2_dio_write_loop(struct dio_write *dio)
dropped_locks = fdm_dropped_locks();
current->faults_disabled_mapping = NULL;
- if (kthread)
- unuse_mm(dio->mm);
/*
* If the fault handler returned an error but also signalled
@@ -2229,113 +2340,93 @@ static long bch2_dio_write_loop(struct dio_write *dio)
goto err;
}
- bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
- dio->op.end_io = bch2_dio_write_loop_async;
+ bch2_write_op_init(&dio->op, c, opts);
+ dio->op.end_io = sync
+ ? NULL
+ : bch2_dio_write_loop_async;
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+ dio->op.devs_need_flush = &inode->ei_devs_need_flush;
- if ((req->ki_flags & IOCB_DSYNC) &&
- !c->opts.journal_flush_disabled)
- dio->op.flags |= BCH_WRITE_FLUSH;
+ if (sync)
+ dio->op.flags |= BCH_WRITE_SYNC;
dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+ bio_sectors(bio), true);
+ if (unlikely(ret))
+ goto err;
+
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
- !bch2_check_range_allocated(c, inode_inum(inode),
- dio->op.pos.offset, bio_sectors(bio),
- dio->op.opts.data_replicas,
- dio->op.opts.compression != 0))
+ !bch2_dio_write_check_allocated(dio))
goto err;
task_io_account_write(bio->bi_iter.bi_size);
- if (!dio->sync && !dio->loop && dio->iter.count) {
- struct iovec *iov = dio->inline_vecs;
-
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
- GFP_KERNEL);
- if (unlikely(!iov)) {
- dio->sync = sync = true;
- goto do_io;
- }
-
- dio->free_iov = true;
- }
+ if (unlikely(dio->iter.count) &&
+ !dio->sync &&
+ !dio->loop &&
+ bch2_dio_write_copy_iov(dio))
+ dio->sync = sync = true;
- memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.iov = iov;
- }
-do_io:
dio->loop = true;
closure_call(&dio->op.cl, bch2_write, NULL, NULL);
- if (sync)
- wait_for_completion(&dio->done);
- else
+ if (!sync)
return -EIOCBQUEUED;
-loop:
- i_sectors_acct(c, inode, &dio->quota_res,
- dio->op.i_sectors_delta);
- req->ki_pos += (u64) dio->op.written << 9;
- dio->written += dio->op.written;
- spin_lock(&inode->v.i_lock);
- if (req->ki_pos > inode->v.i_size)
- i_size_write(&inode->v, req->ki_pos);
- spin_unlock(&inode->v.i_lock);
+ bch2_dio_write_end(dio);
- bio_for_each_segment_all(bv, bio, i)
- put_page(bv->bv_page);
- bio->bi_vcnt = 0;
-
- if (dio->op.error) {
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
+ if (likely(!dio->iter.count) || dio->op.error)
break;
- }
- if (!dio->iter.count)
- break;
+ bio_reset(bio, NULL, REQ_OP_WRITE);
+ }
+out:
+ return bch2_dio_write_done(dio);
+err:
+ dio->op.error = ret;
- bio_reset(bio);
- reinit_completion(&dio->done);
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+ struct bvec_iter_all iter;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, iter)
+ put_page(bv->bv_page);
}
- ret = dio->op.error ?: ((long) dio->written << 9);
-err:
- up(&c->io_in_flight);
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ goto out;
+}
- if (dio->free_iov)
- kfree(dio->iter.iov);
-
- bio_for_each_segment_all(bv, bio, i)
- put_page(bv->bv_page);
- bio_put(bio);
+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+{
+ struct mm_struct *mm = dio->mm;
- /* inode->i_dio_count is our ref on inode and thus bch_fs */
- inode_dio_end(&inode->v);
+ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
- if (!sync) {
- req->ki_complete(req, ret, 0);
- ret = -EIOCBQUEUED;
- }
- return ret;
+ if (mm)
+ kthread_use_mm(mm);
+ bch2_dio_write_loop(dio);
+ if (mm)
+ kthread_unuse_mm(mm);
}
static void bch2_dio_write_loop_async(struct bch_write_op *op)
{
struct dio_write *dio = container_of(op, struct dio_write, op);
- if (dio->sync)
- complete(&dio->done);
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error)
+ bch2_dio_write_done(dio);
else
- bch2_dio_write_loop(dio);
+ bch2_dio_write_continue(dio);
}
static noinline
@@ -2373,7 +2464,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
goto err;
inode_dio_begin(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_get(inode);
extending = req->ki_pos + iter->count > inode->v.i_size;
if (!extending) {
@@ -2381,30 +2472,33 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
locked = false;
}
- bio = bio_alloc_bioset(GFP_KERNEL,
- iov_iter_npages(iter, BIO_MAX_PAGES),
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_WRITE,
+ GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
- init_completion(&dio->done);
dio->req = req;
+ dio->mapping = mapping;
+ dio->inode = inode;
dio->mm = current->mm;
dio->loop = false;
+ dio->extending = extending;
dio->sync = is_sync_kiocb(req) || extending;
+ dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->written = 0;
dio->iter = *iter;
+ dio->op.c = c;
- ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
- iter->count >> 9, true);
- if (unlikely(ret))
- goto err_put_bio;
-
- ret = write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter->count - 1);
- if (unlikely(ret))
- goto err_put_bio;
+ if (unlikely(mapping->nrpages)) {
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter->count - 1);
+ if (unlikely(ret))
+ goto err_put_bio;
+ }
ret = bch2_dio_write_loop(dio);
err:
@@ -2412,8 +2506,7 @@ err:
inode_unlock(&inode->v);
return ret;
err_put_bio:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ bch2_pagecache_block_put(inode);
bio_put(bio);
inode_dio_end(&inode->v);
goto err;
@@ -2425,8 +2518,10 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct bch_inode_info *inode = file_bch_inode(file);
ssize_t ret;
- if (iocb->ki_flags & IOCB_DIRECT)
- return bch2_direct_write(iocb, from);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = bch2_direct_write(iocb, from);
+ goto out;
+ }
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(&inode->v);
@@ -2453,8 +2548,8 @@ unlock:
if (ret > 0)
ret = generic_write_sync(iocb, ret);
-
- return ret;
+out:
+ return bch2_err_class(ret);
}
/* fsync: */
@@ -2463,19 +2558,21 @@ unlock:
* inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
* insert trigger: look up the btree inode instead
*/
-static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum)
+static int bch2_flush_inode(struct bch_fs *c,
+ struct bch_inode_info *inode)
{
- struct bch_inode_unpacked inode;
+ struct bch_inode_unpacked u;
int ret;
if (c->opts.journal_flush_disabled)
return 0;
- ret = bch2_inode_find_by_inum(c, inum, &inode);
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
if (ret)
return ret;
- return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq);
+ return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
}
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2486,9 +2583,9 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = file_write_and_wait_range(file, start, end);
ret2 = sync_inode_metadata(&inode->v, 1);
- ret3 = bch2_flush_inode(c, inode_inum(inode));
+ ret3 = bch2_flush_inode(c, inode);
- return ret ?: ret2 ?: ret3;
+ return bch2_err_class(ret ?: ret2 ?: ret3);
}
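
Every fs-facing return in the hunks above and below now goes through bch2_err_class(), which maps bcachefs-private error codes (transaction restarts, the per-callsite ENOMEM codes added later in this patch, and so on) back to standard errnos before they can leak to the VFS. A minimal sketch of that convention; example_vfs_op() is hypothetical, bch2_err_class() is the helper this patch already uses:

static int example_vfs_op(struct bch_inode_info *inode)
{
	int ret = 0;

	/* ... internal helpers may fail with -BCH_ERR_* codes ... */

	/* convert private error codes exactly once, at the outermost return */
	return bch2_err_class(ret);
}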
/* truncate: */
@@ -2510,19 +2607,15 @@ retry:
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
if (bkey_extent_is_data(k.k)) {
ret = 1;
break;
}
- }
start = iter.pos;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
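
The retry loops in this file now detect a restart with bch2_err_matches(ret, BCH_ERR_transaction_restart) instead of comparing against -EINTR. A stripped-down sketch of the loop shape these hunks converge on (the same shape quota_reserve_range() uses further down); example_btree_op() is hypothetical, the rest are helpers already used in this patch:

static int example_btree_op(struct bch_fs *c)
{
	struct btree_trans trans;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	/* ... btree work that may fail with a transaction restart ... */

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_exit(&trans);
	return ret;
}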
@@ -2558,8 +2651,8 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
* page
*/
ret = range_has_data(c, inode->ei_subvol,
- POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT),
- POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT));
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
if (ret <= 0)
return ret;
@@ -2622,7 +2715,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
* redirty the full page:
*/
page_mkclean(page);
- __set_page_dirty_nobuffers(page);
+ filemap_dirty_folio(mapping, page_folio(page));
unlock:
unlock_page(page);
put_page(page);
@@ -2669,7 +2762,7 @@ static int bch2_extend(struct user_namespace *mnt_userns,
truncate_setsize(&inode->v, iattr->ia_size);
- return bch2_setattr_nonsize(inode, iattr);
+ return bch2_setattr_nonsize(mnt_userns, inode, iattr);
}
static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
@@ -2714,7 +2807,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
}
inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_get(inode);
ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
if (ret)
@@ -2729,8 +2822,10 @@ int bch2_truncate(struct user_namespace *mnt_userns,
if (ret)
goto err;
- WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
- inode->v.i_size < inode_u.bi_size);
+ WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+ inode->v.i_size < inode_u.bi_size,
+ "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
+ (u64) inode->v.i_size, inode_u.bi_size);
if (iattr->ia_size > inode->v.i_size) {
ret = bch2_extend(mnt_userns, inode, &inode_u, iattr);
@@ -2779,9 +2874,11 @@ int bch2_truncate(struct user_namespace *mnt_userns,
U64_MAX, &i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
- WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
- !bch2_journal_error(&c->journal));
-
+ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+ !bch2_journal_error(&c->journal), c,
+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks,
+ inode->ei_inode.bi_sectors);
if (unlikely(ret))
goto err;
@@ -2789,10 +2886,10 @@ int bch2_truncate(struct user_namespace *mnt_userns,
ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
mutex_unlock(&inode->ei_update_lock);
- ret = bch2_setattr_nonsize(inode, iattr);
+ ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- return ret;
+ bch2_pagecache_block_put(inode);
+ return bch2_err_class(ret);
}
/* fallocate: */
@@ -2823,7 +2920,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
truncate_pagecache_range(&inode->v, offset, end - 1);
- if (block_start < block_end ) {
+ if (block_start < block_end) {
s64 i_sectors_delta = 0;
ret = bch2_fpunch(c, inode_inum(inode),
@@ -2910,7 +3007,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
bch2_trans_copy_iter(&dst, &src);
bch2_trans_copy_iter(&del, &src);
- while (ret == 0 || ret == -EINTR) {
+ while (ret == 0 ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
@@ -2936,7 +3034,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
k = insert
? bch2_btree_iter_peek_prev(&src)
- : bch2_btree_iter_peek(&src);
+ : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX));
if ((ret = bkey_err(k)))
continue;
@@ -2944,13 +3042,13 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
break;
if (insert &&
- bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
+ bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9)))
break;
reassemble:
bch2_bkey_buf_reassemble(&copy, c, k);
if (insert &&
- bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
+ bkey_lt(bkey_start_pos(k.k), move_pos))
bch2_cut_front(move_pos, copy.k);
copy.k->k.p.offset += shift >> 9;
@@ -2960,7 +3058,7 @@ reassemble:
if (ret)
continue;
- if (bkey_cmp(atomic_end, copy.k->k.p)) {
+ if (!bkey_eq(atomic_end, copy.k->k.p)) {
if (insert) {
move_pos = atomic_end;
move_pos.offset -= shift >> 9;
@@ -2978,13 +3076,7 @@ reassemble:
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
- if (copy.k->k.size == k.k->size) {
- /*
- * If we're moving the entire extent, we can skip
- * running triggers:
- */
- trigger_flags |= BTREE_TRIGGER_NORUN;
- } else {
+ if (copy.k->k.size != k.k->size) {
/* We might end up splitting compressed extents: */
unsigned nr_ptrs =
bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
@@ -3035,20 +3127,19 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
struct btree_trans trans;
struct btree_iter iter;
struct bpos end_pos = POS(inode->v.i_ino, end_sector);
- unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
+ struct bch_io_opts opts;
int ret = 0;
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(inode->v.i_ino, start_sector),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- while (!ret && bkey_cmp(iter.pos, end_pos) < 0) {
+ while (!ret && bkey_lt(iter.pos, end_pos)) {
s64 i_sectors_delta = 0;
- struct disk_reservation disk_res = { 0 };
struct quota_res quota_res = { 0 };
- struct bkey_i_reservation reservation;
struct bkey_s_c k;
unsigned sectors;
u32 snapshot;
@@ -3067,8 +3158,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
goto bkey_err;
/* already reserved */
- if (k.k->type == KEY_TYPE_reservation &&
- bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
+ if (bkey_extent_is_reservation(k) &&
+ bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
bch2_btree_iter_advance(&iter);
continue;
}
@@ -3079,16 +3170,12 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
continue;
}
- bkey_reservation_init(&reservation.k_i);
- reservation.k.type = KEY_TYPE_reservation;
- reservation.k.p = k.k->p;
- reservation.k.size = k.k->size;
-
- bch2_cut_front(iter.pos, &reservation.k_i);
- bch2_cut_back(end_pos, &reservation.k_i);
+ /*
+ * XXX: for nocow mode, we should promote shared extents to
+ * unshared here
+ */
- sectors = reservation.k.size;
- reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k);
+ sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset;
if (!bkey_extent_is_allocation(k.k)) {
ret = bch2_quota_reservation_add(c, inode,
@@ -3098,34 +3185,23 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
goto bkey_err;
}
- if (reservation.v.nr_replicas < replicas ||
- bch2_bkey_sectors_compressed(k)) {
- ret = bch2_disk_reservation_get(c, &disk_res, sectors,
- replicas, 0);
- if (unlikely(ret))
- goto bkey_err;
-
- reservation.v.nr_replicas = disk_res.nr_replicas;
- }
-
- ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
- &reservation.k_i,
- &disk_res, NULL,
- 0, &i_sectors_delta, true);
+ ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
+ sectors, opts, &i_sectors_delta,
+ writepoint_hashed((unsigned long) current));
if (ret)
goto bkey_err;
+
i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
- bch2_disk_reservation_put(c, &disk_res);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
}
bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
- if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
+ if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
struct quota_res quota_res = { 0 };
s64 i_sectors_delta = 0;
@@ -3176,7 +3252,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
* so that the VFS cache i_size is consistent with the btree i_size:
*/
if (ret &&
- !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)))
+ !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
return ret;
if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
@@ -3204,12 +3280,16 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
long ret;
- if (!percpu_ref_tryget(&c->writes))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
return -EROFS;
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_get(inode);
+
+ ret = file_modified(file);
+ if (ret)
+ goto err;
if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
ret = bchfs_fallocate(inode, mode, offset, len);
@@ -3221,242 +3301,65 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
ret = bchfs_fcollapse_finsert(inode, offset, len, false);
else
ret = -EOPNOTSUPP;
-
-
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+err:
+ bch2_pagecache_block_put(inode);
inode_unlock(&inode->v);
- percpu_ref_put(&c->writes);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
- return ret;
+ return bch2_err_class(ret);
}
-static int generic_access_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- struct inode *inode = file->f_mapping->host;
- loff_t max_size = inode->i_sb->s_maxbytes;
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = MAX_NON_LFS;
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
- *count = min(*count, max_size - pos);
- return 0;
-}
-
-static int generic_write_check_limits(struct file *file, loff_t pos,
- loff_t *count)
-{
- loff_t limit = rlimit(RLIMIT_FSIZE);
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- *count = min(*count, limit - pos);
- }
-
- return generic_access_check_limits(file, pos, count);
-}
-
-static int generic_remap_checks(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *req_count, unsigned int remap_flags)
+/*
+ * Take a quota reservation for unallocated blocks in a given file range.
+ * Does not check pagecache.
+ */
+static int quota_reserve_range(struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 start, u64 end)
{
- struct inode *inode_in = file_in->f_mapping->host;
- struct inode *inode_out = file_out->f_mapping->host;
- uint64_t count = *req_count;
- uint64_t bcount;
- loff_t size_in, size_out;
- loff_t bs = inode_out->i_sb->s_blocksize;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot;
+ u64 sectors = end - start;
+ u64 pos = start;
int ret;
- /* The start of both ranges must be aligned to an fs block. */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
- return -EINVAL;
-
- /* Ensure offsets don't wrap. */
- if (pos_in + count < pos_in || pos_out + count < pos_out)
- return -EINVAL;
-
- size_in = i_size_read(inode_in);
- size_out = i_size_read(inode_out);
-
- /* Dedupe requires both ranges to be within EOF. */
- if ((remap_flags & REMAP_FILE_DEDUP) &&
- (pos_in >= size_in || pos_in + count > size_in ||
- pos_out >= size_out || pos_out + count > size_out))
- return -EINVAL;
-
- /* Ensure the infile range is within the infile. */
- if (pos_in >= size_in)
- return -EINVAL;
- count = min(count, size_in - (uint64_t)pos_in);
-
- ret = generic_access_check_limits(file_in, pos_in, &count);
- if (ret)
- return ret;
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
- ret = generic_write_check_limits(file_out, pos_out, &count);
+ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
if (ret)
- return ret;
-
- /*
- * If the user wanted us to link to the infile's EOF, round up to the
- * next block boundary for this check.
- *
- * Otherwise, make sure the count is also block-aligned, having
- * already confirmed the starting offsets' block alignment.
- */
- if (pos_in + count == size_in) {
- bcount = ALIGN(size_in, bs) - pos_in;
- } else {
- if (!IS_ALIGNED(count, bs))
- count = ALIGN_DOWN(count, bs);
- bcount = count;
- }
-
- /* Don't allow overlapped cloning within the same file. */
- if (inode_in == inode_out &&
- pos_out + bcount > pos_in &&
- pos_out < pos_in + bcount)
- return -EINVAL;
-
- /*
- * We shortened the request but the caller can't deal with that, so
- * bounce the request back to userspace.
- */
- if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
- return -EINVAL;
-
- *req_count = count;
- return 0;
-}
-
-static int generic_remap_check_len(struct inode *inode_in,
- struct inode *inode_out,
- loff_t pos_out,
- loff_t *len,
- unsigned int remap_flags)
-{
- u64 blkmask = i_blocksize(inode_in) - 1;
- loff_t new_len = *len;
-
- if ((*len & blkmask) == 0)
- return 0;
-
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
- new_len &= ~blkmask;
-
- if (new_len == *len)
- return 0;
-
- if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
- *len = new_len;
- return 0;
- }
-
- return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
-}
-
-static int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- bool same_inode = (inode_in == inode_out);
- int ret;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
-
- if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Zero length dedupe exits immediately; reflink goes to EOF. */
- if (*len == 0) {
- loff_t isize = i_size_read(inode_in);
+ goto err;
- if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
- return 0;
- if (pos_in > isize)
- return -EINVAL;
- *len = isize - pos_in;
- if (*len == 0)
- return 0;
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, pos, snapshot), 0);
+
+ while (!(ret = btree_trans_too_many_iters(&trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
+ !(ret = bkey_err(k))) {
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+ bch2_btree_iter_advance(&iter);
}
+ pos = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- /* Check that we don't violate system file offset limits. */
- ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- if (ret)
- return ret;
-
- /* Wait for the completion of any pending IOs on both files */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- ret = filemap_write_and_wait_range(inode_in->i_mapping,
- pos_in, pos_in + *len - 1);
- if (ret)
- return ret;
-
- ret = filemap_write_and_wait_range(inode_out->i_mapping,
- pos_out, pos_out + *len - 1);
- if (ret)
- return ret;
-
- /*
- * Check that the extents are the same.
- */
- if (remap_flags & REMAP_FILE_DEDUP) {
- bool is_same = false;
-
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
- if (ret)
- return ret;
- if (!is_same)
- return -EBADE;
- }
+ bch2_trans_exit(&trans);
- ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
- remap_flags);
if (ret)
return ret;
- /* If can't alter the file contents, we're done. */
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- /* Update the timestamps, since we can alter file contents. */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- return ret;
- }
-
- /*
- * Clear the security bits if the process is not being run by
- * root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- return ret;
- }
-
- return 0;
+ return bch2_quota_reservation_add(c, inode, res, sectors, true);
}
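
A sketch of how a caller is expected to pair the new helper with the existing accounting primitives: reserve against the destination range up front, charge the observed sector delta to that reservation, then release whatever is left. The real call sites are in bch2_remap_file_range() below; example_reserve_and_account() itself is hypothetical:

static int example_reserve_and_account(struct bch_inode_info *inode,
				       u64 start_sector, u64 end_sector)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct quota_res quota_res = { 0 };
	s64 i_sectors_delta = 0;
	int ret;

	ret = quota_reserve_range(inode, &quota_res, start_sector, end_sector);
	if (ret)
		return ret;

	/* ... do the work that changes the inode's allocated sectors ... */

	i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
	bch2_quota_reservation_put(c, inode, &quota_res);
	return 0;
}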
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
@@ -3466,6 +3369,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct bch_inode_info *src = file_bch_inode(file_src);
struct bch_inode_info *dst = file_bch_inode(file_dst);
struct bch_fs *c = src->v.i_sb->s_fs_info;
+ struct quota_res quota_res = { 0 };
s64 i_sectors_delta = 0;
u64 aligned_len;
loff_t ret = 0;
@@ -3486,8 +3390,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
- file_update_time(file_dst);
-
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
@@ -3504,6 +3406,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (ret)
goto err;
+ ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
+ (pos_dst + aligned_len) >> 9);
+ if (ret)
+ goto err;
+
+ file_update_time(file_dst);
+
mark_pagecache_unallocated(src, pos_src >> 9,
(pos_src + aligned_len) >> 9);
@@ -3520,8 +3429,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
*/
ret = min((u64) ret << 9, (u64) len);
- /* XXX get a quota reservation */
- i_sectors_acct(c, dst, NULL, i_sectors_delta);
+ i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
spin_lock(&dst->v.i_lock);
if (pos_dst + ret > dst->v.i_size)
@@ -3530,18 +3438,19 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
IS_SYNC(file_inode(file_dst)))
- ret = bch2_flush_inode(c, inode_inum(dst));
+ ret = bch2_flush_inode(c, dst);
err:
+ bch2_quota_reservation_put(c, dst, &quota_res);
bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
- return ret;
+ return bch2_err_class(ret);
}
/* fseek: */
-static int page_data_offset(struct page *page, unsigned offset)
+static int folio_data_offset(struct folio *folio, unsigned offset)
{
- struct bch_page_state *s = bch2_page_state(page);
+ struct bch_page_state *s = bch2_page_state(&folio->page);
unsigned i;
if (s)
@@ -3556,36 +3465,38 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode,
loff_t start_offset,
loff_t end_offset)
{
- struct address_space *mapping = vinode->i_mapping;
- struct page *page;
+ struct folio_batch fbatch;
pgoff_t start_index = start_offset >> PAGE_SHIFT;
pgoff_t end_index = end_offset >> PAGE_SHIFT;
pgoff_t index = start_index;
+ unsigned i;
loff_t ret;
int offset;
- while (index <= end_index) {
- if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
- lock_page(page);
+ folio_batch_init(&fbatch);
- offset = page_data_offset(page,
- page->index == start_index
+ while (filemap_get_folios(vinode->i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ folio_lock(folio);
+ offset = folio_data_offset(folio,
+ folio->index == start_index
? start_offset & (PAGE_SIZE - 1)
: 0);
if (offset >= 0) {
- ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
+ ret = clamp(((loff_t) folio->index << PAGE_SHIFT) +
offset,
start_offset, end_offset);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
return ret;
}
-
- unlock_page(page);
- put_page(page);
- } else {
- break;
+ folio_unlock(folio);
}
+ folio_batch_release(&fbatch);
+ cond_resched();
}
return end_offset;
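
bch2_seek_pagecache_data() now walks the pagecache a folio batch at a time rather than one page per lookup. A reduced sketch of that loop, using only the helpers the hunk above already relies on; example_walk_folios() is hypothetical:

static void example_walk_folios(struct address_space *mapping,
				pgoff_t index, pgoff_t end_index)
{
	struct folio_batch fbatch;
	unsigned i;

	folio_batch_init(&fbatch);

	while (filemap_get_folios(mapping, &index, end_index, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			folio_lock(folio);
			/* ... inspect per-folio state here ... */
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);	/* drops the batch's folio refs */
		cond_resched();
	}
}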
@@ -3615,11 +3526,11 @@ retry:
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
- if (k.k->p.inode != inode->v.i_ino) {
- break;
- } else if (bkey_extent_is_data(k.k)) {
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
+ POS(inode->v.i_ino, U64_MAX),
+ 0, k, ret) {
+ if (bkey_extent_is_data(k.k)) {
next_data = max(offset, bkey_start_offset(k.k) << 9);
break;
} else if (k.k->p.offset >> 9 > isize)
@@ -3627,7 +3538,7 @@ retry:
}
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
@@ -3742,7 +3653,7 @@ retry:
}
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
@@ -3757,22 +3668,31 @@ err:
loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
{
+ loff_t ret;
+
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
case SEEK_END:
- return generic_file_llseek(file, offset, whence);
+ ret = generic_file_llseek(file, offset, whence);
+ break;
case SEEK_DATA:
- return bch2_seek_data(file, offset);
+ ret = bch2_seek_data(file, offset);
+ break;
case SEEK_HOLE:
- return bch2_seek_hole(file, offset);
+ ret = bch2_seek_hole(file, offset);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
}
- return -EINVAL;
+ return bch2_err_class(ret);
}
void bch2_fs_fsio_exit(struct bch_fs *c)
{
+ bioset_exit(&c->nocow_flush_bioset);
bioset_exit(&c->dio_write_bioset);
bioset_exit(&c->dio_read_bioset);
bioset_exit(&c->writepage_bioset);
@@ -3786,14 +3706,22 @@ int bch2_fs_fsio_init(struct bch_fs *c)
if (bioset_init(&c->writepage_bioset,
4, offsetof(struct bch_writepage_io, op.wbio.bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->dio_read_bioset,
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_writepage_bioset_init;
+
+ if (bioset_init(&c->dio_read_bioset,
4, offsetof(struct dio_read, rbio.bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->dio_write_bioset,
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_dio_read_bioset_init;
+
+ if (bioset_init(&c->dio_write_bioset,
4, offsetof(struct dio_write, op.wbio.bio),
BIOSET_NEED_BVECS))
- ret = -ENOMEM;
+ return -BCH_ERR_ENOMEM_dio_write_bioset_init;
+
+ if (bioset_init(&c->nocow_flush_bioset,
+ 1, offsetof(struct nocow_flush, bio), 0))
+ return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
index f9e7f49b13c7..a8835298613a 100644
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
@@ -15,15 +15,13 @@ int __must_check bch2_write_inode_size(struct bch_fs *,
struct bch_inode_info *,
loff_t, unsigned);
-int bch2_writepage(struct page *, struct writeback_control *);
-int bch2_readpage(struct file *, struct page *);
+int bch2_read_folio(struct file *, struct folio *);
int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page **, void **);
+ unsigned, struct page **, void **);
int bch2_write_end(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page *, void *);
@@ -36,10 +34,6 @@ int bch2_truncate(struct user_namespace *,
struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
-#define REMAP_FILE_ADVISORY (0)
-#define REMAP_FILE_DEDUP (1 << 0)
-#define REMAP_FILE_CAN_SHORTEN (1 << 1)
-
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
loff_t, loff_t, unsigned);
@@ -47,10 +41,8 @@ loff_t bch2_llseek(struct file *, loff_t, int);
vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
-int bch2_releasepage(struct page *, gfp_t);
-int bch2_migrate_page(struct address_space *, struct page *,
- struct page *, enum migrate_mode);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index a76017386593..571b4dca4d39 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -26,6 +26,9 @@ struct flags_set {
unsigned flags;
unsigned projid;
+
+ bool set_projinherit;
+ bool projinherit;
};
static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -50,6 +53,11 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
(newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
return -EINVAL;
+ if (s->set_projinherit) {
+ bi->bi_fields_set &= ~(1 << Inode_opt_project);
+ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
+ }
+
bi->bi_flags &= ~s->mask;
bi->bi_flags |= newflags;
@@ -85,7 +93,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
return ret;
inode_lock(&inode->v);
- if (!inode_owner_or_capable(&inode->v)) {
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) {
ret = -EACCES;
goto setflags_out;
}
@@ -107,6 +115,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
return copy_to_user(arg, &fa, sizeof(fa));
@@ -138,6 +150,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
+ s.set_projinherit = true;
+ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
if (fa.fsx_xflags)
return -EOPNOTSUPP;
@@ -156,7 +172,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
return ret;
inode_lock(&inode->v);
- if (!inode_owner_or_capable(&inode->v)) {
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) {
ret = -EACCES;
goto err;
}
@@ -268,22 +284,20 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
down_write(&c->vfs_sb->s_umount);
switch (flags) {
- case FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(c->vfs_sb->s_bdev);
+ case FSOP_GOING_FLAGS_DEFAULT:
+ ret = freeze_bdev(c->vfs_sb->s_bdev);
if (ret)
goto err;
- if (sb && !IS_ERR(sb)) {
- bch2_journal_flush(&c->journal);
- c->vfs_sb->s_flags |= SB_RDONLY;
- bch2_fs_emergency_read_only(c);
- thaw_bdev(c->vfs_sb->s_bdev, sb);
- }
+ bch2_journal_flush(&c->journal);
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ thaw_bdev(c->vfs_sb->s_bdev);
break;
- }
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
+ fallthrough;
case FSOP_GOING_FLAGS_NOLOGFLUSH:
c->vfs_sb->s_flags |= SB_RDONLY;
@@ -379,7 +393,8 @@ retry:
goto err3;
}
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(file_mnt_user_ns(filp),
+ dir, MAY_WRITE | MAY_EXEC);
if (error)
goto err3;
@@ -394,7 +409,7 @@ retry:
!arg.src_ptr)
snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
- inode = __bch2_create(NULL, to_bch_ei(dir),
+ inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir),
dst_dentry, arg.mode|S_IFDIR,
0, snapshot_src, create_flags);
error = PTR_ERR_OR_ZERO(inode);
@@ -436,17 +451,20 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
return ret;
if (path.dentry->d_sb->s_fs_info != c) {
- path_put(&path);
- return -EXDEV;
+ ret = -EXDEV;
+ goto err;
}
dir = path.dentry->d_parent->d_inode;
ret = __bch2_unlink(dir, path.dentry, true);
- if (!ret)
- d_delete(path.dentry);
- path_put(&path);
+ if (ret)
+ goto err;
+ fsnotify_rmdir(dir, path.dentry);
+ d_delete(path.dentry);
+err:
+ path_put(&path);
return ret;
}
@@ -454,51 +472,67 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
switch (cmd) {
case FS_IOC_GETFLAGS:
- return bch2_ioc_getflags(inode, (int __user *) arg);
+ ret = bch2_ioc_getflags(inode, (int __user *) arg);
+ break;
case FS_IOC_SETFLAGS:
- return bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ break;
case FS_IOC_FSGETXATTR:
- return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ break;
+
case FS_IOC_FSSETXATTR:
- return bch2_ioc_fssetxattr(c, file, inode,
- (void __user *) arg);
+ ret = bch2_ioc_fssetxattr(c, file, inode,
+ (void __user *) arg);
+ break;
case BCHFS_IOC_REINHERIT_ATTRS:
- return bch2_ioc_reinherit_attrs(c, file, inode,
- (void __user *) arg);
+ ret = bch2_ioc_reinherit_attrs(c, file, inode,
+ (void __user *) arg);
+ break;
case FS_IOC_GETVERSION:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
+
case FS_IOC_SETVERSION:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
case FS_IOC_GOINGDOWN:
- return bch2_ioc_goingdown(c, (u32 __user *) arg);
+ ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
+ break;
case BCH_IOCTL_SUBVOLUME_CREATE: {
struct bch_ioctl_subvolume i;
- if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
- return -EFAULT;
- return bch2_ioctl_subvolume_create(c, file, i);
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_create(c, file, i);
+ break;
}
case BCH_IOCTL_SUBVOLUME_DESTROY: {
struct bch_ioctl_subvolume i;
- if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
- return -EFAULT;
- return bch2_ioctl_subvolume_destroy(c, file, i);
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_destroy(c, file, i);
+ break;
}
default:
- return bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ break;
}
+
+ return bch2_err_class(ret);
}
#ifdef CONFIG_COMPAT
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 60e6ba4918c4..fafd64509f6b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -8,6 +8,7 @@
#include "buckets.h"
#include "chardev.h"
#include "dirent.h"
+#include "errcode.h"
#include "extents.h"
#include "fs.h"
#include "fs-common.h"
@@ -30,6 +31,7 @@
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
#include <linux/random.h>
+#include <linux/seq_file.h>
#include <linux/statfs.h>
#include <linux/string.h>
#include <linux/xattr.h>
@@ -41,58 +43,6 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_subvolume *);
-static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
-{
- BUG_ON(atomic_long_read(&lock->v) == 0);
-
- if (atomic_long_sub_return_release(i, &lock->v) == 0)
- wake_up_all(&lock->wait);
-}
-
-static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
-{
- long v = atomic_long_read(&lock->v), old;
-
- do {
- old = v;
-
- if (i > 0 ? v < 0 : v > 0)
- return false;
- } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
- old, old + i)) != old);
- return true;
-}
-
-static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
-{
- wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
-}
-
-void bch2_pagecache_add_put(struct pagecache_lock *lock)
-{
- __pagecache_lock_put(lock, 1);
-}
-
-bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
-{
- return __pagecache_lock_tryget(lock, 1);
-}
-
-void bch2_pagecache_add_get(struct pagecache_lock *lock)
-{
- __pagecache_lock_get(lock, 1);
-}
-
-void bch2_pagecache_block_put(struct pagecache_lock *lock)
-{
- __pagecache_lock_put(lock, -1);
-}
-
-void bch2_pagecache_block_get(struct pagecache_lock *lock)
-{
- __pagecache_lock_get(lock, -1);
-}
-
void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -104,7 +54,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
bch2_assert_pos_locked(trans, BTREE_ID_inodes,
POS(0, bi->bi_inum),
- 0 && c->opts.inodes_use_key_cache);
+ c->opts.inodes_use_key_cache);
set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
@@ -152,9 +102,14 @@ retry:
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ bch2_fs_fatal_err_on(ret == -ENOENT, c,
+ "inode %u:%llu not found when updating",
+ inode_inum(inode).subvol,
+ inode_inum(inode).inum);
+
bch2_trans_exit(&trans);
return ret < 0 ? ret : 0;
}
@@ -251,6 +206,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
return ERR_PTR(ret);
}
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+
unlock_new_inode(&inode->v);
return &inode->v;
@@ -263,7 +222,6 @@ __bch2_create(struct user_namespace *mnt_userns,
unsigned flags)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct user_namespace *ns = dir->v.i_sb->s_user_ns;
struct btree_trans trans;
struct bch_inode_unpacked dir_u;
struct bch_inode_info *inode, *old;
@@ -304,8 +262,8 @@ retry:
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
- from_kuid(ns, current_fsuid()),
- from_kgid(ns, current_fsgid()),
+ from_kuid(mnt_userns, current_fsuid()),
+ from_kgid(mnt_userns, current_fsgid()),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@@ -323,7 +281,7 @@ retry:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
err_before_quota:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
goto err_trans;
}
@@ -365,6 +323,9 @@ err_before_quota:
inode = old;
} else {
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
/*
* we really don't want insert_inode_locked2() to be setting
* I_NEW...
@@ -409,24 +370,26 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
return d_splice_alias(vinode, dentry);
}
-static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
+static int bch2_mknod(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
struct bch_inode_info *inode =
- __bch2_create(NULL, to_bch_ei(vdir), dentry, mode, rdev,
+ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
(subvol_inum) { 0 }, 0);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return bch2_err_class(PTR_ERR(inode));
d_instantiate(dentry, &inode->v);
return 0;
}
-static int bch2_create(struct inode *vdir, struct dentry *dentry,
+static int bch2_create(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry,
umode_t mode, bool excl)
{
- return bch2_mknod(vdir, dentry, mode|S_IFREG, 0);
+ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0);
}
static int __bch2_link(struct bch_fs *c,
@@ -441,7 +404,7 @@ static int __bch2_link(struct bch_fs *c,
mutex_lock(&inode->ei_update_lock);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_link_trans(&trans,
inode_inum(dir), &dir_u,
inode_inum(inode), &inode_u,
@@ -490,20 +453,28 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_unlink_trans(&trans,
- inode_inum(dir), &dir_u,
- &inode_u, &dentry->d_name,
- deleting_snapshot));
+ ret = commit_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_unlink_trans(&trans,
+ inode_inum(dir), &dir_u,
+ &inode_u, &dentry->d_name,
+ deleting_snapshot));
+ if (unlikely(ret))
+ goto err;
- if (likely(!ret)) {
- bch2_inode_update_after_write(&trans, dir, &dir_u,
- ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(&trans, inode, &inode_u,
- ATTR_MTIME);
- }
+ bch2_inode_update_after_write(&trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(&trans, inode, &inode_u,
+ ATTR_MTIME);
+ if (inode_u.bi_subvol) {
+ /*
+ * Subvolume deletion is asynchronous, but we still want to tell
+ * the VFS that it's been deleted here:
+ */
+ set_nlink(&inode->v, 0);
+ }
+err:
bch2_trans_exit(&trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@@ -515,17 +486,18 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
return __bch2_unlink(vdir, dentry, false);
}
-static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
+static int bch2_symlink(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry,
const char *symname)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
int ret;
- inode = __bch2_create(NULL, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
(subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
- if (unlikely(IS_ERR(inode)))
- return PTR_ERR(inode);
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
inode_lock(&inode->v);
ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
@@ -549,12 +521,14 @@ err:
return ret;
}
-static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
+static int bch2_mkdir(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0);
+ return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0);
}
-static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
+static int bch2_rename2(struct user_namespace *mnt_userns,
+ struct inode *src_vdir, struct dentry *src_dentry,
struct inode *dst_vdir, struct dentry *dst_dentry,
unsigned flags)
{
@@ -609,7 +583,7 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
goto err;
}
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_rename_trans(&trans,
inode_inum(src_dir), &src_dir_u,
inode_inum(dst_dir), &dst_dir_u,
@@ -660,7 +634,8 @@ err:
return ret;
}
-static void bch2_setattr_copy(struct bch_inode_info *inode,
+static void bch2_setattr_copy(struct user_namespace *mnt_userns,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
struct iattr *attr)
{
@@ -668,9 +643,9 @@ static void bch2_setattr_copy(struct bch_inode_info *inode,
unsigned int ia_valid = attr->ia_valid;
if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid);
+ bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid);
if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid);
+ bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid);
if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@@ -689,13 +664,14 @@ static void bch2_setattr_copy(struct bch_inode_info *inode,
: inode->v.i_gid;
if (!in_group_p(gid) &&
- !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
+ !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
}
-int bch2_setattr_nonsize(struct bch_inode_info *inode,
+int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
+ struct bch_inode_info *inode,
struct iattr *attr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -711,10 +687,10 @@ int bch2_setattr_nonsize(struct bch_inode_info *inode,
qid = inode->ei_qid;
if (attr->ia_valid & ATTR_UID)
- qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
+ qid.q[QTYP_USR] = from_kuid(mnt_userns, attr->ia_uid);
if (attr->ia_valid & ATTR_GID)
- qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
+ qid.q[QTYP_GRP] = from_kgid(mnt_userns, attr->ia_gid);
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@@ -732,7 +708,7 @@ retry:
if (ret)
goto btree_err;
- bch2_setattr_copy(inode, &inode_u, attr);
+ bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
if (attr->ia_valid & ATTR_MODE) {
ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
@@ -747,7 +723,7 @@ retry:
btree_err:
bch2_trans_iter_exit(&trans, &inode_iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err_trans;
@@ -761,10 +737,11 @@ err_trans:
err:
mutex_unlock(&inode->ei_update_lock);
- return ret;
+ return bch2_err_class(ret);
}
-static int bch2_getattr(const struct path *path, struct kstat *stat,
+static int bch2_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned query_flags)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
@@ -804,34 +781,37 @@ static int bch2_getattr(const struct path *path, struct kstat *stat,
return 0;
}
-static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
+static int bch2_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr)
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
int ret;
lockdep_assert_held(&inode->v.i_rwsem);
- ret = setattr_prepare(dentry, iattr);
+ ret = setattr_prepare(mnt_userns, dentry, iattr);
if (ret)
return ret;
return iattr->ia_valid & ATTR_SIZE
- ? bch2_truncate(NULL, inode, iattr)
- : bch2_setattr_nonsize(inode, iattr);
+ ? bch2_truncate(mnt_userns, inode, iattr)
+ : bch2_setattr_nonsize(mnt_userns, inode, iattr);
}
-static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
+static int bch2_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *vdir, struct file *file, umode_t mode)
{
struct bch_inode_info *inode =
- __bch2_create(NULL, to_bch_ei(vdir), dentry, mode, 0,
+ __bch2_create(mnt_userns, to_bch_ei(vdir),
+ file->f_path.dentry, mode, 0,
(subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return bch2_err_class(PTR_ERR(inode));
- d_mark_tmpfile(dentry, &inode->v);
- d_instantiate(dentry, &inode->v);
- return 0;
+ d_mark_tmpfile(file, &inode->v);
+ d_instantiate(file->f_path.dentry, &inode->v);
+ return finish_open_simple(file, 0);
}
static int bch2_fill_extent(struct bch_fs *c,
@@ -851,6 +831,9 @@ static int bch2_fill_extent(struct bch_fs *c,
int flags2 = 0;
u64 offset = p.ptr.offset;
+ if (p.ptr.unwritten)
+ flags2 |= FIEMAP_EXTENT_UNWRITTEN;
+
if (p.crc.compression_type)
flags2 |= FIEMAP_EXTENT_ENCODED;
else
@@ -902,6 +885,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
u32 snapshot;
int ret = 0;
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
if (start + len < start)
return -EINVAL;
@@ -920,9 +907,9 @@ retry:
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(ei->v.i_ino, start, snapshot), 0);
- while ((k = bch2_btree_iter_peek(&iter)).k &&
- !(ret = bkey_err(k)) &&
- bkey_cmp(iter.pos, end) < 0) {
+ while (!(ret = btree_trans_too_many_iters(&trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+ !(ret = bkey_err(k))) {
enum btree_id data_btree = BTREE_ID_extents;
if (!bkey_extent_is_data(k.k) &&
@@ -971,7 +958,7 @@ retry:
start = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!ret && have_extent)
@@ -1017,15 +1004,6 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
return bch2_readdir(c, inode_inum(inode), ctx);
}
-static int bch2_clone_file_range(struct file *file_src, loff_t pos_src,
- struct file *file_dst, loff_t pos_dst,
- u64 len)
-{
- return bch2_remap_file_range(file_src, pos_src,
- file_dst, pos_dst,
- len, 0);
-}
-
static const struct file_operations bch_file_operations = {
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
@@ -1040,7 +1018,7 @@ static const struct file_operations bch_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
- .clone_file_range = bch2_clone_file_range,
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
@@ -1107,18 +1085,17 @@ static const struct inode_operations bch_special_inode_operations = {
};
static const struct address_space_operations bch_address_space_operations = {
- .writepage = bch2_writepage,
- .readpage = bch2_readpage,
+ .read_folio = bch2_read_folio,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .readahead = bch2_readahead,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
- .invalidatepage = bch2_invalidatepage,
- .releasepage = bch2_releasepage,
+ .invalidate_folio = bch2_invalidate_folio,
+ .release_folio = bch2_release_folio,
.direct_IO = noop_direct_IO,
#ifdef CONFIG_MIGRATION
- .migratepage = bch2_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
#endif
.error_remove_page = generic_error_remove_page,
};
@@ -1331,7 +1308,7 @@ found:
memcpy(name, d.v->d_name, name_len);
name[name_len] = '\0';
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter1);
@@ -1404,7 +1381,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
- pagecache_lock_init(&inode->ei_pagecache_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
mutex_init(&inode->ei_quota_lock);
return &inode->v;
@@ -1448,7 +1426,7 @@ static int bch2_vfs_write_inode(struct inode *vinode,
ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
- return ret;
+ return bch2_err_class(ret);
}
static void bch2_evict_inode(struct inode *vinode)
@@ -1467,55 +1445,80 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode_inum(inode), true);
+ bch2_inode_rm(c, inode_inum(inode));
}
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_del_init(&inode->ei_vfs_inode_list);
+ mutex_unlock(&c->vfs_inodes_lock);
}
-void bch2_evict_subvolume_inodes(struct bch_fs *c,
- struct snapshot_id_list *s)
+void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
- struct super_block *sb = c->vfs_sb;
- struct inode *inode;
+ struct bch_inode_info *inode, **i;
+ DARRAY(struct bch_inode_info *) grabbed;
+ bool clean_pass = false, this_pass_clean;
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
- (inode->i_state & I_FREEING))
- continue;
+ /*
+ * Initially, we scan for inodes without I_DONTCACHE, then mark them to
+ * be pruned with d_mark_dontcache().
+ *
+ * Once we've had a clean pass where we didn't find any inodes without
+ * I_DONTCACHE, we wait for them to be freed:
+ */
- d_mark_dontcache(inode);
- d_prune_aliases(inode);
- }
- spin_unlock(&sb->s_inode_list_lock);
+ darray_init(&grabbed);
+ darray_make_room(&grabbed, 1024);
again:
cond_resched();
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
- (inode->i_state & I_FREEING))
+ this_pass_clean = true;
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
+ if (!snapshot_list_has_id(s, inode->ei_subvol))
continue;
- if (!(inode->i_state & I_DONTCACHE)) {
- d_mark_dontcache(inode);
- d_prune_aliases(inode);
- }
+ if (!(inode->v.i_state & I_DONTCACHE) &&
+ !(inode->v.i_state & I_FREEING)) {
+ this_pass_clean = false;
+
+ d_mark_dontcache(&inode->v);
+ d_prune_aliases(&inode->v);
+
+ /*
+ * If i_count was zero, we have to take and release a
+ * ref in order for I_DONTCACHE to be noticed and the
+ * inode to be dropped.
+ */
+
+ if (!atomic_read(&inode->v.i_count) &&
+ igrab(&inode->v) &&
+ darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN))
+ break;
+ } else if (clean_pass && this_pass_clean) {
+ wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
+ DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
- spin_lock(&inode->i_lock);
- if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
- !(inode->i_state & I_FREEING)) {
- wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
- DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
+ mutex_unlock(&c->vfs_inodes_lock);
+
schedule();
finish_wait(wq, &wait.wq_entry);
goto again;
}
+ }
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ darray_for_each(grabbed, i)
+ iput(&(*i)->v);
+ grabbed.nr = 0;
- spin_unlock(&inode->i_lock);
+ if (!clean_pass || !this_pass_clean) {
+ clean_pass = this_pass_clean;
+ goto again;
}
- spin_unlock(&sb->s_inode_list_lock);
+
+ darray_exit(&grabbed);
}
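
The eviction rewrite above uses a darray to hold the inodes it temporarily grabbed, so that their I_DONTCACHE state is acted on when the last reference drops. A reduced sketch of that grab-then-release pattern with the candidate-selection loop elided; example_grab_and_release() is hypothetical, while the darray helpers and the igrab()/iput() usage mirror the hunk above:

static void example_grab_and_release(struct bch_inode_info *inode)
{
	DARRAY(struct bch_inode_info *) grabbed;
	struct bch_inode_info **i;

	darray_init(&grabbed);

	/* take a ref so dropping it later makes I_DONTCACHE take effect */
	if (!atomic_read(&inode->v.i_count) &&
	    igrab(&inode->v) &&
	    darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN))
		iput(&inode->v);	/* push failed: drop the ref directly */

	darray_for_each(grabbed, i)
		iput(&(*i)->v);

	darray_exit(&grabbed);
}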
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1552,6 +1555,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
static int bch2_sync_fs(struct super_block *sb, int wait)
{
struct bch_fs *c = sb->s_fs_info;
+ int ret;
if (c->opts.journal_flush_disabled)
return 0;
@@ -1561,19 +1565,21 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
return 0;
}
- return bch2_journal_flush(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
+ return bch2_err_class(ret);
}
static struct bch_fs *bch2_path_to_fs(const char *path)
{
struct bch_fs *c;
- struct block_device *bdev = lookup_bdev(path);
+ dev_t dev;
+ int ret;
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ ret = lookup_bdev(path, &dev);
+ if (ret)
+ return ERR_PTR(ret);
- c = bch2_dev_to_fs(bdev->bd_dev);
- bdput(bdev);
+ c = bch2_dev_to_fs(dev);
if (c)
closure_put(&c->cl);
return c ?: ERR_PTR(-ENOENT);
@@ -1616,7 +1622,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
ret = bch2_parse_mount_opts(c, &opts, data);
if (ret)
- return ret;
+ goto err;
if (opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
@@ -1630,7 +1636,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
if (ret) {
bch_err(c, "error going rw: %i", ret);
up_write(&c->state_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
sb->s_flags &= ~SB_RDONLY;
@@ -1643,8 +1650,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
if (opts.errors >= 0)
c->opts.errors = opts.errors;
-
- return ret;
+err:
+ return bch2_err_class(ret);
}
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
@@ -1669,7 +1676,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
enum bch_opt_id i;
- char buf[512];
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
@@ -1681,13 +1689,17 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;
- bch2_opt_to_text(&PBUF(buf), c, opt, v,
+ printbuf_reset(&buf);
+ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
OPT_SHOW_MOUNT_STYLE);
seq_putc(seq, ',');
- seq_puts(seq, buf);
+ seq_puts(seq, buf.buf);
}
- return 0;
+ if (buf.allocation_failure)
+ ret = -ENOMEM;
+ printbuf_exit(&buf);
+ return ret;
}
static void bch2_put_super(struct super_block *sb)
@@ -1804,8 +1816,11 @@ got_sb:
kfree(devs[0]);
kfree(devs);
- if (IS_ERR(sb))
- return ERR_CAST(sb);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ ret = bch2_err_class(ret);
+ return ERR_PTR(ret);
+ }
c = sb->s_fs_info;
@@ -1829,16 +1844,16 @@ got_sb:
sb->s_xattr = bch2_xattr_handlers;
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.nsec_per_time_unit;
+ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
+ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
c->vfs_sb = sb;
- strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+ strscpy(sb->s_id, c->name, sizeof(sb->s_id));
ret = super_setup_bdi(sb);
if (ret)
goto err_put_super;
- sb->s_bdi->congested_fn = bch2_congested;
- sb->s_bdi->congested_data = c;
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
@@ -1857,13 +1872,12 @@ got_sb:
sb->s_flags |= SB_POSIXACL;
#endif
- sb->s_shrink.seeks = 1;
+ sb->s_shrink.seeks = 0;
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
- if (IS_ERR(vinode)) {
- bch_err(c, "error mounting: error getting root inode %i",
- (int) PTR_ERR(vinode));
- ret = PTR_ERR(vinode);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ if (ret) {
+ bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
goto err_put_super;
}
@@ -1880,7 +1894,7 @@ out:
err_put_super:
deactivate_locked_super(sb);
- return ERR_PTR(ret);
+ return ERR_PTR(bch2_err_class(ret));
}
static void bch2_kill_sb(struct super_block *sb)
@@ -1904,8 +1918,7 @@ MODULE_ALIAS_FS("bcachefs");
void bch2_vfs_exit(void)
{
unregister_filesystem(&bcache_fs_type);
- if (bch2_inode_cache)
- kmem_cache_destroy(bch2_inode_cache);
+ kmem_cache_destroy(bch2_inode_cache);
}
int __init bch2_vfs_init(void)
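Several of the fs.c hunks above follow one pattern: internal helpers may now return bcachefs-private error codes, and each VFS-facing entry point converts them back with bch2_err_class() before returning. A sketch of that shape only — example_vfs_op is a hypothetical name, not a new hunk:

static int example_vfs_op(struct bch_fs *c)
{
	int ret = bch2_journal_flush(&c->journal);

	/*
	 * bch2_err_class() collapses a private error code back to the
	 * standard errno it descends from, which is all the VFS layer is
	 * prepared to interpret.
	 */
	return bch2_err_class(ret);
}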
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index b5bc70afb100..cf0413534182 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -6,50 +6,48 @@
#include "opts.h"
#include "str_hash.h"
#include "quota_types.h"
+#include "two_state_shared_lock.h"
#include <linux/seqlock.h>
#include <linux/stat.h>
-/*
- * Two-state lock - can be taken for add or block - both states are shared,
- * like read side of rwsem, but conflict with other state:
- */
-struct pagecache_lock {
- atomic_long_t v;
- wait_queue_head_t wait;
-};
-
-static inline void pagecache_lock_init(struct pagecache_lock *lock)
-{
- atomic_long_set(&lock->v, 0);
- init_waitqueue_head(&lock->wait);
-}
-
-void bch2_pagecache_add_put(struct pagecache_lock *);
-bool bch2_pagecache_add_tryget(struct pagecache_lock *);
-void bch2_pagecache_add_get(struct pagecache_lock *);
-void bch2_pagecache_block_put(struct pagecache_lock *);
-void bch2_pagecache_block_get(struct pagecache_lock *);
-
struct bch_inode_info {
struct inode v;
+ struct list_head ei_vfs_inode_list;
unsigned long ei_flags;
struct mutex ei_update_lock;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
-
- struct pagecache_lock ei_pagecache_lock;
+ two_state_lock_t ei_pagecache_lock;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
u32 ei_subvol;
+ /*
+ * When we've been doing nocow writes we'll need to issue flushes to the
+ * underlying block devices
+ *
+ * XXX: a device may have had a flush issued by some other codepath. It
+ * would be better to keep for each device a sequence number that's
+	 * incremented when we issue a cache flush, and track here the sequence
+ * number that needs flushing.
+ */
+ struct bch_devs_mask ei_devs_need_flush;
+
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
+#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
+
+#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
+#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
+
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
return (subvol_inum) {
@@ -96,7 +94,7 @@ do { \
if ((_locks) & INODE_LOCK) \
down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
+ bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
mutex_lock_nested(&a[i]->ei_update_lock, i);\
} \
@@ -114,7 +112,7 @@ do { \
if ((_locks) & INODE_LOCK) \
up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
+ bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
mutex_unlock(&a[i]->ei_update_lock); \
} \
@@ -186,11 +184,12 @@ void bch2_inode_update_after_write(struct btree_trans *,
int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
inode_set_fn, void *, unsigned);
-int bch2_setattr_nonsize(struct bch_inode_info *,
+int bch2_setattr_nonsize(struct user_namespace *,
+ struct bch_inode_info *,
struct iattr *);
int __bch2_unlink(struct inode *, struct dentry *, bool);
-void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *);
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
@@ -198,7 +197,7 @@ int bch2_vfs_init(void);
#else
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
- struct snapshot_id_list *s) {}
+ snapshot_id_list *s) {}
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
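The bch2_pagecache_*() wrappers above map the old pagecache_lock onto the generic two-state shared lock: the "add" state is taken by paths inserting into the page cache, the "block" state by paths that must exclude such insertions; each state is shared with itself, like the read side of an rwsem, but conflicts with the other. A usage sketch under those assumptions — both function names are invented for illustration:

static void example_buffered_io(struct bch_inode_info *inode)
{
	bch2_pagecache_add_get(inode);		/* shared with other "add" holders */
	/* ... populate the page cache ... */
	bch2_pagecache_add_put(inode);
}

static void example_truncate_like_op(struct bch_inode_info *inode)
{
	bch2_pagecache_block_get(inode);	/* waits out all "add" holders */
	/* ... work with page cache insertions blocked ... */
	bch2_pagecache_block_put(inode);
}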
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 69b4136364c3..ed2523ac2249 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_update.h"
+#include "darray.h"
#include "dirent.h"
#include "error.h"
#include "fs-common.h"
@@ -18,6 +19,10 @@
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+/*
+ * XXX: this handles transaction restarts without returning
+ * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
+ */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
@@ -26,14 +31,12 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
u64 sectors = 0;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_extents,
- SPOS(inum, 0, snapshot), 0, k, ret) {
- if (k.k->p.inode != inum)
- break;
-
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ret)
if (bkey_extent_is_allocation(k.k))
sectors += k.k->size;
- }
bch2_trans_iter_exit(trans, &iter);
@@ -49,11 +52,10 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
u64 subdirs = 0;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot), 0, k, ret) {
- if (k.k->p.inode != inum)
- break;
-
+ for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;
@@ -61,7 +63,6 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
if (d.v->d_type == DT_DIR)
subdirs++;
}
-
bch2_trans_iter_exit(trans, &iter);
return ret ?: subdirs;
@@ -128,16 +129,16 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
if (ret)
goto err;
- if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
+ if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
ret = -ENOENT;
goto err;
}
ret = bch2_inode_unpack(k, inode);
err:
- if (ret && ret != -EINTR)
- bch_err(trans->c, "error %i fetching inode %llu",
- ret, inode_nr);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "error fetching inode %llu: %s",
+ inode_nr, bch2_err_str(ret));
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -163,9 +164,9 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
if (!ret)
*snapshot = iter.pos.snapshot;
err:
- if (ret && ret != -EINTR)
- bch_err(trans->c, "error %i fetching inode %llu:%u",
- ret, inode_nr, *snapshot);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "error fetching inode %llu:%u: %s",
+ inode_nr, *snapshot, bch2_err_str(ret));
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -219,35 +220,39 @@ static int write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
- int ret = __bch2_trans_do(trans, NULL, NULL,
+ int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
__write_inode(trans, inode, snapshot));
if (ret)
- bch_err(trans->c, "error in fsck: error %i updating inode", ret);
+ bch_err(trans->c, "error in fsck: error updating inode: %s",
+ bch2_err_str(ret));
return ret;
}
static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
int ret;
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL) ?:
- bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
- SPOS(inum, 0, snapshot),
- SPOS(inum, U64_MAX, snapshot),
- 0, NULL);
+ do {
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ } while (ret == -BCH_ERR_transaction_restart_nested);
if (ret)
goto err;
retry:
@@ -262,7 +267,7 @@ retry:
goto err;
if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(trans->c,
+ bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum, snapshot);
ret = -EIO;
@@ -272,11 +277,8 @@ retry:
bch2_inode_unpack(k, &inode_u);
/* Subvolume root? */
- if (inode_u.bi_subvol) {
- ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
- if (ret)
- goto err;
- }
+ if (inode_u.bi_subvol)
+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos;
@@ -287,10 +289,10 @@ retry:
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_exit(trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- return ret;
+ return ret ?: -BCH_ERR_transaction_restart_nested;
}
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
@@ -303,15 +305,19 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
ret = lookup_first_inode(trans, pos.inode, &dir_inode);
if (ret)
- return ret;
+ goto err;
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter, 0);
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
+err:
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -346,8 +352,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
goto create_lostfound;
}
- if (ret && ret != -EINTR)
- bch_err(c, "error looking up lost+found: %i", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
if (ret)
return ret;
@@ -369,8 +375,8 @@ create_lostfound:
lostfound, &lostfound_str,
0, 0, S_IFDIR|0700, 0, NULL, NULL,
(subvol_inum) { }, 0);
- if (ret && ret != -EINTR)
- bch_err(c, "error creating lost+found: %i", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
return ret;
}
@@ -429,13 +435,13 @@ static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 inode_snapshot)
{
- int ret = __bch2_trans_do(trans, NULL, NULL,
+ int ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL,
__reattach_inode(trans, inode, inode_snapshot));
if (ret) {
- bch_err(trans->c, "error %i reattaching inode %llu",
- ret, inode->bi_inum);
+ bch_err(trans->c, "error reattaching inode %llu: %s",
+ inode->bi_inum, bch2_err_str(ret));
return ret;
}
@@ -466,19 +472,82 @@ out:
return ret;
}
-static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
+struct snapshots_seen_entry {
+ u32 id;
+ u32 equiv;
+};
+
+struct snapshots_seen {
+ struct bpos pos;
+ DARRAY(struct snapshots_seen_entry) ids;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
- pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+ darray_exit(&s->ids);
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+ memset(s, 0, sizeof(*s));
+}
+
+static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+ struct snapshots_seen_entry *i, n = { id, id };
+ int ret;
+
+ darray_for_each(s->ids, i) {
+ if (n.equiv < i->equiv)
+ break;
- if (bkey_cmp(s->pos, pos))
- s->nr = 0;
+ if (i->equiv == n.equiv) {
+ bch_err(c, "%s(): adding duplicate snapshot", __func__);
+ return -EINVAL;
+ }
+ }
+
+ ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
+
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct snapshots_seen_entry *i, n = {
+ .id = pos.snapshot,
+ .equiv = bch2_snapshot_equiv(c, pos.snapshot),
+ };
+ int ret = 0;
+
+ if (!bkey_eq(s->pos, pos))
+ s->ids.nr = 0;
+
+ pos.snapshot = n.equiv;
s->pos = pos;
- /* Might get called multiple times due to lock restarts */
- if (s->nr && s->d[s->nr - 1] == pos.snapshot)
- return 0;
+ darray_for_each(s->ids, i)
+ if (i->equiv == n.equiv) {
+ if (fsck_err_on(i->id != n.id, c,
+ "snapshot deletion did not run correctly:\n"
+ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
+ bch2_btree_ids[btree_id],
+ pos.inode, pos.offset,
+ i->id, n.id, n.equiv))
+ return -BCH_ERR_need_snapshot_cleanup;
- return snapshots_seen_add(c, s, pos.snapshot);
+ return 0;
+ }
+
+ ret = darray_push(&s->ids, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+fsck_err:
+ return ret;
}
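snapshots_seen above, and inode_walker and pathbuf further down, are all converted from hand-rolled krealloc'd arrays to the darray helpers from darray.h. A minimal sketch of that API as it is used here — the typedef and function are illustrative only, and the error code is simply whatever darray_push() returns on allocation failure:

typedef DARRAY(u32) u32_list;

static int collect_example(u32_list *l)
{
	u32 *i, sum = 0;
	int ret = darray_push(l, 42);	/* grows l->data as needed */

	if (ret)
		return ret;		/* allocation failure */

	darray_for_each(*l, i)		/* i is a pointer into l->data */
		sum += *i;

	darray_exit(l);			/* frees l->data, resets nr/size */
	return sum ? 0 : -EINVAL;
}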
/**
@@ -491,15 +560,15 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
u32 id, u32 ancestor)
{
ssize_t i;
+ u32 top = seen->ids.nr ? seen->ids.data[seen->ids.nr - 1].equiv : 0;
BUG_ON(id > ancestor);
-
- id = snapshot_t(c, id)->equiv;
- ancestor = snapshot_t(c, ancestor)->equiv;
+ BUG_ON(!bch2_snapshot_is_equiv(c, id));
+ BUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
/* @ancestor should be the snapshot most recently added to @seen */
- BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
- BUG_ON(seen->pos.snapshot != ancestor);
+ BUG_ON(ancestor != seen->pos.snapshot);
+ BUG_ON(ancestor != top);
if (id == ancestor)
return true;
@@ -507,11 +576,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
if (!bch2_snapshot_is_ancestor(c, id, ancestor))
return false;
- for (i = seen->nr - 2;
- i >= 0 && seen->d[i] >= id;
+ for (i = seen->ids.nr - 2;
+ i >= 0 && seen->ids.data[i].equiv >= id;
--i)
- if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
- bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) &&
+ bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor))
return false;
return true;
@@ -536,27 +605,41 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
: bch2_snapshot_is_ancestor(c, src, dst);
}
-#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
- for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+static int ref_visible2(struct bch_fs *c,
+ u32 src, struct snapshots_seen *src_seen,
+ u32 dst, struct snapshots_seen *dst_seen)
+{
+ src = bch2_snapshot_equiv(c, src);
+ dst = bch2_snapshot_equiv(c, dst);
+
+ if (dst > src) {
+ swap(dst, src);
+ swap(dst_seen, src_seen);
+ }
+ return key_visible_in_snapshot(c, src_seen, dst, src);
+}
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
+ (_i)->snapshot <= (_snapshot); _i++) \
if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ u64 count;
+};
+
struct inode_walker {
bool first_this_inode;
u64 cur_inum;
- size_t nr;
- size_t size;
- struct inode_walker_entry {
- struct bch_inode_unpacked inode;
- u32 snapshot;
- u64 count;
- } *d;
+ DARRAY(struct inode_walker_entry) inodes;
};
static void inode_walker_exit(struct inode_walker *w)
{
- kfree(w->d);
- w->d = NULL;
+ darray_exit(&w->inodes);
}
static struct inode_walker inode_walker_init(void)
@@ -564,43 +647,17 @@ static struct inode_walker inode_walker_init(void)
return (struct inode_walker) { 0, };
}
-static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w)
-{
- if (w->nr == w->size) {
- size_t new_size = max_t(size_t, 8UL, w->size * 2);
- void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
- GFP_KERNEL);
- if (!d) {
- bch_err(c, "fsck: error allocating memory for inode_walker, size %zu",
- new_size);
- return -ENOMEM;
- }
-
- w->d = d;
- w->size = new_size;
- }
-
- return 0;
-}
-
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c inode)
{
struct bch_inode_unpacked u;
- int ret;
-
- ret = inode_walker_realloc(c, w);
- if (ret)
- return ret;
BUG_ON(bch2_inode_unpack(inode, &u));
- w->d[w->nr++] = (struct inode_walker_entry) {
+ return darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
- .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv,
- };
-
- return 0;
+ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+ }));
}
static int __walk_inode(struct btree_trans *trans,
@@ -609,17 +666,18 @@ static int __walk_inode(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- unsigned i, ancestor_pos;
+ u32 restart_count = trans->restart_count;
+ unsigned i;
int ret;
- pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+ pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot);
if (pos.inode == w->cur_inum) {
w->first_this_inode = false;
goto lookup_snapshot;
}
- w->nr = 0;
+ w->inodes.nr = 0;
for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -636,27 +694,33 @@ static int __walk_inode(struct btree_trans *trans,
w->cur_inum = pos.inode;
w->first_this_inode = true;
+
+ if (trans_was_restarted(trans, restart_count))
+ return -BCH_ERR_transaction_restart_nested;
+
lookup_snapshot:
- for (i = 0; i < w->nr; i++)
- if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+ for (i = 0; i < w->inodes.nr; i++)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot))
goto found;
return INT_MAX;
found:
- BUG_ON(pos.snapshot > w->d[i].snapshot);
+ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot);
+
+ if (pos.snapshot != w->inodes.data[i].snapshot) {
+ struct inode_walker_entry e = w->inodes.data[i];
- if (pos.snapshot != w->d[i].snapshot) {
- ancestor_pos = i;
+ e.snapshot = pos.snapshot;
+ e.count = 0;
- while (i && w->d[i - 1].snapshot > pos.snapshot)
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
+ pos.inode, pos.snapshot, w->inodes.data[i].snapshot);
+
+ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
--i;
- ret = inode_walker_realloc(c, w);
+ ret = darray_insert_item(&w->inodes, i, e);
if (ret)
return ret;
-
- array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
- w->d[i].snapshot = pos.snapshot;
- w->d[i].count = 0;
}
return i;
@@ -672,21 +736,23 @@ static int __get_visible_inodes(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- w->nr = 0;
+ w->inodes.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
if (k.k->p.offset != inum)
break;
- if (!bkey_is_inode(k.k))
+ if (!ref_visible(c, s, s->pos.snapshot, equiv))
continue;
- if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+ if (bkey_is_inode(k.k))
add_inode(c, w, k);
- if (k.k->p.snapshot >= s->pos.snapshot)
- break;
- }
+
+ if (equiv >= s->pos.snapshot)
+ break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -698,15 +764,16 @@ static int check_key_has_snapshot(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
"key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -715,9 +782,6 @@ static int hash_redo_key(struct btree_trans *trans,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
- bch_err(trans->c, "hash_redo_key() not implemented yet");
- return -EINVAL;
-#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
@@ -725,18 +789,22 @@ static int hash_redo_key(struct btree_trans *trans,
if (IS_ERR(delete))
return PTR_ERR(delete);
- tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ tmp = bch2_bkey_make_mut(trans, k);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
- bkey_reassemble(tmp, k);
-
bkey_init(&delete->k);
delete->k.p = k_iter->pos;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
-#endif
+ bch2_hash_set_snapshot(trans, desc, hash_info,
+ (subvol_inum) { 0, k.k->p.inode },
+ k.k->p.snapshot, tmp,
+ BCH_HASH_SET_MUST_CREATE,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
}
static int hash_check_key(struct btree_trans *trans,
@@ -746,7 +814,7 @@ static int hash_check_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
- char buf[200];
+ struct printbuf buf = PRINTBUF;
struct bkey_s_c k;
u64 hash;
int ret = 0;
@@ -762,16 +830,18 @@ static int hash_check_key(struct btree_trans *trans,
if (hash_k.k->p.offset < hash)
goto bad_hash;
- for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
- BTREE_ITER_SLOTS, k, ret) {
- if (!bkey_cmp(k.k->p, hash_k.k->p))
+ for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ if (bkey_eq(k.k->p, hash_k.k->p))
break;
if (fsck_err_on(k.k->type == desc.key_type &&
!desc.cmp_bkey(k, hash_k), c,
"duplicate hash table keys:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- hash_k), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ buf.buf))) {
ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
break;
}
@@ -780,49 +850,48 @@ static int hash_check_key(struct btree_trans *trans,
bch2_trans_iter_exit(trans, &iter);
goto bad_hash;
}
-
}
+out:
bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
return ret;
bad_hash:
- if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
- "hashed to %llu\n%s",
- desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
- return 0;
-
- ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
- if (ret) {
- bch_err(c, "hash_redo_key err %i", ret);
- return ret;
+ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
+ if (ret) {
+ bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
+ return ret;
+ }
+ ret = -BCH_ERR_transaction_restart_nested;
}
- return -EINTR;
fsck_err:
- return ret;
+ goto out;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bkey_s_c k,
struct bch_inode_unpacked *prev,
+ struct snapshots_seen *s,
bool full)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
struct bch_inode_unpacked u;
bool do_update = false;
int ret;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret < 0)
+ goto err;
if (ret)
- return ret;
+ return 0;
- ret = check_key_has_snapshot(trans, iter, k);
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
- return ret < 0 ? ret : 0;
+ goto err;
/*
* if snapshot id isn't a leaf node, skip it - deletion in
@@ -861,8 +930,9 @@ static int check_inode(struct btree_trans *trans,
bch2_fs_lazy_rw(c);
ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
- if (ret)
- bch_err(c, "error in fsck: error %i while deleting inode", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error in fsck: error while deleting inode: %s",
+ bch2_err_str(ret));
return ret;
}
@@ -884,10 +954,11 @@ static int check_inode(struct btree_trans *trans,
iter->pos.snapshot),
POS(u.bi_inum, U64_MAX),
0, NULL);
- if (ret) {
- bch_err(c, "error in fsck: error %i truncating inode", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error in fsck: error truncating inode: %s",
+ bch2_err_str(ret));
+ if (ret)
return ret;
- }
/*
* We truncated without our normal sector accounting hook, just
@@ -910,8 +981,8 @@ static int check_inode(struct btree_trans *trans,
sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
if (sectors < 0) {
- bch_err(c, "error in fsck: error %i recounting inode sectors",
- (int) sectors);
+ bch_err(c, "error in fsck: error recounting inode sectors: %s",
+ bch2_err_str(sectors));
return sectors;
}
@@ -928,12 +999,15 @@ static int check_inode(struct btree_trans *trans,
}
if (do_update) {
- ret = write_inode(trans, &u, iter->pos.snapshot);
+ ret = __write_inode(trans, &u, iter->pos.snapshot);
if (ret)
- bch_err(c, "error in fsck: error %i "
- "updating inode", ret);
+ bch_err(c, "error in fsck: error updating inode: %s",
+ bch2_err_str(ret));
}
+err:
fsck_err:
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -943,86 +1017,23 @@ static int check_inodes(struct bch_fs *c, bool full)
struct btree_trans trans;
struct btree_iter iter;
struct bch_inode_unpacked prev = { 0 };
- int ret;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_inode(&trans, &iter, &prev, full));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
- return ret;
-}
-
-static int check_subvol(struct btree_trans *trans,
- struct btree_iter *iter)
-{
+ struct snapshots_seen s;
struct bkey_s_c k;
- struct bkey_s_c_subvolume subvol;
- int ret;
-
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (k.k->type != KEY_TYPE_subvolume)
- return 0;
-
- subvol = bkey_s_c_to_subvolume(k);
-
- if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- if (ret && ret != -EINTR)
- bch_err(trans->c, "error deleting subvolume %llu: %i",
- iter->pos.offset, ret);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-noinline_for_stack
-static int check_subvols(struct bch_fs *c)
-{
- struct btree_trans trans;
- struct btree_iter iter;
int ret;
+ snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
- POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_subvol(&trans, &iter));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_inode(&trans, &iter, k, &prev, &s, full));
bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
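check_inodes() above is the first of several walkers rewritten onto for_each_btree_key_commit(), which peeks each key, evaluates the per-key expression inside the transaction, commits, and retries the body itself on transaction restart, so the old open-coded advance/commit loops disappear. A sketch of the calling shape under that assumption — the per-key body here is just a placeholder 0:

static int example_walk(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
			POS_MIN, BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_NOFAIL,
			0 /* per-key work goes here */);

	bch2_trans_exit(&trans);
	return ret;
}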
@@ -1114,7 +1125,7 @@ static int inode_backpointer_exists(struct btree_trans *trans,
SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
ret = bkey_err(d.s_c);
if (ret)
- return ret;
+ return ret == -ENOENT ? 0 : ret;
ret = dirent_points_to_inode(d, inode);
bch2_trans_iter_exit(trans, &iter);
@@ -1125,15 +1136,15 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
- int ret = 0, ret2 = 0;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
s64 count2;
- for (i = w->d; i < w->d + w->nr; i++) {
+ darray_for_each(w->inodes, i) {
if (i->inode.bi_sectors == i->count)
continue;
- count2 = lockrestart_do(trans,
- bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+ count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot);
if (i->count != count2) {
bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
@@ -1146,114 +1157,240 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->cur_inum, i->snapshot,
- i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
+ i->inode.bi_sectors, i->count)) {
+ i->inode.bi_sectors = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
+ if (!ret && trans_was_restarted(trans, restart_count))
+ ret = -BCH_ERR_transaction_restart_nested;
+ return ret;
+}
+
+struct extent_end {
+ u32 snapshot;
+ u64 offset;
+ struct snapshots_seen seen;
+};
+
+typedef DARRAY(struct extent_end) extent_ends;
+
+static int check_overlapping_extents(struct btree_trans *trans,
+ struct snapshots_seen *seen,
+ extent_ends *extent_ends,
+ struct bkey_s_c k,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct extent_end *i;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ darray_for_each(*extent_ends, i) {
+ /* duplicate, due to transaction restart: */
+ if (i->offset == k.k->p.offset &&
+ i->snapshot == k.k->p.snapshot)
continue;
- i->inode.bi_sectors = i->count;
- ret = write_inode(trans, &i->inode, i->snapshot);
- if (ret)
- break;
- ret2 = -EINTR;
+ if (!ref_visible2(c,
+ k.k->p.snapshot, seen,
+ i->snapshot, &i->seen))
+ continue;
+
+ if (fsck_err_on(i->offset > bkey_start_offset(k.k), c,
+ "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s",
+ i->snapshot,
+ i->offset,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+ bkey_reassemble(update, k);
+ ret = bch2_trans_update_extent(trans, iter, update, 0);
+ if (!ret)
+ goto err;
+ }
}
+err:
fsck_err:
- return ret ?: ret2;
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int extent_ends_at(extent_ends *extent_ends,
+ struct snapshots_seen *seen,
+ struct bkey_s_c k)
+{
+ struct extent_end *i, n = (struct extent_end) {
+ .snapshot = k.k->p.snapshot,
+ .offset = k.k->p.offset,
+ .seen = *seen,
+ };
+
+ n.seen.ids.data = kmemdup(seen->ids.data,
+ sizeof(seen->ids.data[0]) * seen->ids.size,
+ GFP_KERNEL);
+ if (!n.seen.ids.data)
+ return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+
+ darray_for_each(*extent_ends, i) {
+ if (i->snapshot == k.k->p.snapshot) {
+ snapshots_seen_exit(&i->seen);
+ *i = n;
+ return 0;
+ }
+
+ if (i->snapshot >= k.k->p.snapshot)
+ break;
+ }
+
+ return darray_insert_item(extent_ends, i - extent_ends->data, n);
+}
+
+static void extent_ends_reset(extent_ends *extent_ends)
+{
+ struct extent_end *i;
+
+ darray_for_each(*extent_ends, i)
+ snapshots_seen_exit(&i->seen);
+
+ extent_ends->nr = 0;
}
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
struct inode_walker *inode,
- struct snapshots_seen *s)
+ struct snapshots_seen *s,
+ extent_ends *extent_ends)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
struct inode_walker_entry *i;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv;
int ret = 0;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
- return ret < 0 ? ret : 0;
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
+
+ equiv = k.k->p;
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
- ret = snapshots_seen_update(c, s, k.k->p);
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
- return ret;
+ goto err;
if (k.k->type == KEY_TYPE_whiteout)
- return 0;
+ goto out;
if (inode->cur_inum != k.k->p.inode) {
ret = check_i_sectors(trans, inode);
if (ret)
- return ret;
+ goto err;
+
+ extent_ends_reset(extent_ends);
}
-#if 0
- if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
- char buf1[200];
- char buf2[200];
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
- bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+ BUG_ON(!iter->path->should_be_locked);
- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
- return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
- }
-#endif
- ret = __walk_inode(trans, inode, k.k->p);
+ ret = check_overlapping_extents(trans, s, extent_ends, k, iter);
+ if (ret)
+ goto err;
+
+ ret = extent_ends_at(extent_ends, s, k);
+ if (ret)
+ goto err;
+
+ ret = __walk_inode(trans, inode, equiv);
if (ret < 0)
- return ret;
+ goto err;
if (fsck_err_on(ret == INT_MAX, c,
"extent in missing inode:\n %s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
- if (ret == INT_MAX)
- return 0;
+ if (ret == INT_MAX) {
+ ret = 0;
+ goto out;
+ }
- i = inode->d + ret;
+ i = inode->inodes.data + ret;
ret = 0;
if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
!S_ISLNK(i->inode.bi_mode), c,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
+
+ /*
+ * Check inodes in reverse order, from oldest snapshots to newest, so
+ * that we emit the fewest number of whiteouts necessary:
+ */
+ for (i = inode->inodes.data + inode->inodes.nr - 1;
+ i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > equiv.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+ continue;
- if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
- for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- k.k->type != KEY_TYPE_reservation &&
- k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
- k.k->p.snapshot),
- POS(k.k->p.inode, U64_MAX),
- 0, NULL) ?: -EINTR;
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ !bkey_extent_is_reservation(k), c,
+ "extent type past end of inode %llu:%u, i_size %llu\n %s",
+ i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_btree_delete_at(trans, &iter2,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter2);
+ if (ret)
+ goto err;
+
+ if (i->snapshot != equiv.snapshot) {
+ ret = snapshots_seen_add(c, s, i->snapshot);
+ if (ret)
+ goto err;
}
}
}
if (bkey_extent_is_allocation(k.k))
- for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+ for_each_visible_inode(c, s, inode, equiv.snapshot, i)
i->count += k.k->size;
#if 0
bch2_bkey_buf_reassemble(&prev, c, k);
#endif
+out:
+err:
fsck_err:
+ printbuf_exit(&buf);
+
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -1268,40 +1405,30 @@ static int check_extents(struct bch_fs *c)
struct snapshots_seen s;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
+ extent_ends extent_ends = { 0 };
int ret = 0;
-#if 0
- struct bkey_buf prev;
- bch2_bkey_buf_init(&prev);
- prev.k->k = KEY(0, 0, 0);
-#endif
snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch_verbose(c, "checking extents");
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_extent(&trans, &iter, k, &w, &s, &extent_ends));
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_extent(&trans, &iter, &w, &s));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
-#if 0
- bch2_bkey_buf_exit(&prev, c);
-#endif
+ extent_ends_reset(&extent_ends);
+ darray_exit(&extent_ends);
inode_walker_exit(&w);
bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -1309,15 +1436,17 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
- int ret = 0, ret2 = 0;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
s64 count2;
- for (i = w->d; i < w->d + w->nr; i++) {
+ darray_for_each(w->inodes, i) {
if (i->inode.bi_nlink == i->count)
continue;
- count2 = lockrestart_do(trans,
- bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
+ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
+ if (count2 < 0)
+ return count2;
if (i->count != count2) {
bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
@@ -1334,11 +1463,14 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
ret = write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
- ret2 = -EINTR;
}
}
fsck_err:
- return ret ?: ret2;
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
+ if (!ret && trans_was_restarted(trans, restart_count))
+ ret = -BCH_ERR_transaction_restart_nested;
+ return ret;
}
static int check_dirent_target(struct btree_trans *trans,
@@ -1350,7 +1482,7 @@ static int check_dirent_target(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
bool backpointer_exists = true;
- char buf[200];
+ struct printbuf buf = PRINTBUF;
int ret = 0;
if (!target->bi_dir &&
@@ -1376,15 +1508,13 @@ static int check_dirent_target(struct btree_trans *trans,
"directory %llu with multiple links",
target->bi_inum)) {
ret = __remove_dirent(trans, d.k->p);
- if (ret)
- goto err;
- return 0;
+ goto out;
}
if (fsck_err_on(backpointer_exists &&
!target->bi_nlink, c,
- "inode %llu has multiple links but i_nlink 0",
- target->bi_inum)) {
+ "inode %llu type %s has multiple links but i_nlink 0",
+ target->bi_inum, bch2_d_types[d.v->d_type])) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_UNLINKED;
@@ -1415,18 +1545,19 @@ static int check_dirent_target(struct btree_trans *trans,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
- (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_type = inode_d_type(target);
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
- return ret;
+ goto err;
d = dirent_i_to_s_c(n);
}
@@ -1440,94 +1571,110 @@ static int check_dirent_target(struct btree_trans *trans,
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
- return ret;
+ goto err;
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
ret = bch2_trans_update(trans, iter, &n->k_i, 0);
if (ret)
- return ret;
+ goto err;
d = dirent_i_to_s_c(n);
}
+out:
err:
fsck_err:
+ printbuf_exit(&buf);
+
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
struct bch_hash_info *hash_info,
struct inode_walker *dir,
struct inode_walker *target,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
- char buf[200];
- int ret;
-
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv;
+ int ret = 0;
ret = check_key_has_snapshot(trans, iter, k);
- if (ret)
- return ret < 0 ? ret : 0;
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
- ret = snapshots_seen_update(c, s, k.k->p);
+ equiv = k.k->p;
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
if (ret)
- return ret;
+ goto err;
if (k.k->type == KEY_TYPE_whiteout)
- return 0;
+ goto out;
if (dir->cur_inum != k.k->p.inode) {
ret = check_subdir_count(trans, dir);
if (ret)
- return ret;
+ goto err;
}
- ret = __walk_inode(trans, dir, k.k->p);
+ BUG_ON(!iter->path->should_be_locked);
+
+ ret = __walk_inode(trans, dir, equiv);
if (ret < 0)
- return ret;
+ goto err;
if (fsck_err_on(ret == INT_MAX, c,
"dirent in nonexisting directory:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
- if (ret == INT_MAX)
- return 0;
+ if (ret == INT_MAX) {
+ ret = 0;
+ goto out;
+ }
- i = dir->d + ret;
+ i = dir->inodes.data + ret;
ret = 0;
if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
"dirent in non directory inode type %s:\n%s",
bch2_d_type_str(inode_d_type(&i->inode)),
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return bch2_btree_delete_at(trans, iter, 0);
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
if (dir->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
+ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
ret = hash_check_key(trans, bch2_dirent_hash_desc,
hash_info, iter, k);
if (ret < 0)
- return ret;
- if (ret) /* dirent has been deleted */
- return 0;
+ goto err;
+ if (ret) {
+ /* dirent has been deleted */
+ ret = 0;
+ goto out;
+ }
if (k.k->type != KEY_TYPE_dirent)
- return 0;
+ goto out;
d = bkey_s_c_to_dirent(k);
@@ -1540,24 +1687,27 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
ret = __subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && ret != -ENOENT)
- return ret;
+ goto err;
if (fsck_err_on(ret, c,
"dirent points to missing subvolume %llu",
- le64_to_cpu(d.v->d_child_subvol)))
- return __remove_dirent(trans, d.k->p);
+ le64_to_cpu(d.v->d_child_subvol))) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto err;
+ }
ret = __lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && ret != -ENOENT)
- return ret;
+ goto err;
if (fsck_err_on(ret, c,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
bch_err(c, "repair not implemented yet");
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
@@ -1567,40 +1717,48 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
subvol_root.bi_subvol = target_subvol;
ret = __write_inode(trans, &subvol_root, target_snapshot);
if (ret)
- return ret;
+ goto err;
}
ret = check_dirent_target(trans, iter, d, &subvol_root,
target_snapshot);
if (ret)
- return ret;
+ goto err;
} else {
ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
if (ret)
- return ret;
+ goto err;
- if (fsck_err_on(!target->nr, c,
- "dirent points to missing inode:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
+ if (fsck_err_on(!target->inodes.nr, c,
+ "dirent points to missing inode: (equiv %u)\n%s",
+ equiv.snapshot,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
ret = __remove_dirent(trans, d.k->p);
if (ret)
- return ret;
+ goto err;
}
- for (i = target->d; i < target->d + target->nr; i++) {
+ darray_for_each(target->inodes, i) {
ret = check_dirent_target(trans, iter, d,
&i->inode, i->snapshot);
if (ret)
- return ret;
+ goto err;
}
}
if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ for_each_visible_inode(c, s, dir, equiv.snapshot, i)
i->count++;
+out:
+err:
fsck_err:
+ printbuf_exit(&buf);
+
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -1617,6 +1775,7 @@ static int check_dirents(struct bch_fs *c)
struct bch_hash_info hash_info;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret = 0;
bch_verbose(c, "checking dirents");
@@ -1624,46 +1783,32 @@ static int check_dirents(struct bch_fs *c)
snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_dirent(&trans, &iter, &hash_info,
- &dir, &target, &s));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
+
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
struct bch_hash_info *hash_info,
struct inode_walker *inode)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
ret = check_key_has_snapshot(trans, iter, k);
if (ret)
return ret;
@@ -1683,10 +1828,12 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
ret = 0;
if (inode->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
+ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
fsck_err:
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -1700,30 +1847,25 @@ static int check_xattrs(struct bch_fs *c)
struct bch_hash_info hash_info;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret = 0;
bch_verbose(c, "checking xattrs");
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- do {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_xattr(&trans, &iter, &hash_info,
- &inode));
- if (ret)
- break;
- } while (bch2_btree_iter_advance(&iter));
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_xattr(&trans, &iter, k, &hash_info, &inode));
bch2_trans_exit(&trans);
+
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -1750,12 +1892,13 @@ static int check_root_trans(struct btree_trans *trans)
root_subvol.v.flags = 0;
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
- ret = __bch2_trans_do(trans, NULL, NULL,
+ ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
- __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
+ __bch2_btree_insert(trans, BTREE_ID_subvolumes,
+ &root_subvol.k_i, 0));
if (ret) {
- bch_err(c, "error writing root subvol: %i", ret);
+ bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
goto err;
}
@@ -1774,7 +1917,7 @@ static int check_root_trans(struct btree_trans *trans)
ret = __write_inode(trans, &root_inode, snapshot);
if (ret)
- bch_err(c, "error writing root inode: %i", ret);
+ bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
}
err:
fsck_err:
@@ -1793,21 +1936,18 @@ static int check_root(struct bch_fs *c)
check_root_trans(&trans));
}
-struct pathbuf {
- size_t nr;
- size_t size;
-
- struct pathbuf_entry {
- u64 inum;
- u32 snapshot;
- } *entries;
+struct pathbuf_entry {
+ u64 inum;
+ u32 snapshot;
};
-static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
+typedef DARRAY(struct pathbuf_entry) pathbuf;
+
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
struct pathbuf_entry *i;
- for (i = p->entries; i < p->entries + p->nr; i++)
+ darray_for_each(*p, i)
if (i->inum == inum &&
i->snapshot == snapshot)
return true;
@@ -1815,29 +1955,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-static int path_down(struct bch_fs *c, struct pathbuf *p,
+static int path_down(struct bch_fs *c, pathbuf *p,
u64 inum, u32 snapshot)
{
- if (p->nr == p->size) {
- size_t new_size = max_t(size_t, 256UL, p->size * 2);
- void *n = krealloc(p->entries,
- new_size * sizeof(p->entries[0]),
- GFP_KERNEL);
- if (!n) {
- bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
- new_size);
- return -ENOMEM;
- }
-
- p->entries = n;
- p->size = new_size;
- };
-
- p->entries[p->nr++] = (struct pathbuf_entry) {
+ int ret = darray_push(p, ((struct pathbuf_entry) {
.inum = inum,
.snapshot = snapshot,
- };
- return 0;
+ }));
+
+ if (ret)
+ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+ p->size);
+ return ret;
}
/*
@@ -1846,14 +1975,14 @@ static int path_down(struct bch_fs *c, struct pathbuf *p,
* XXX: we should also be verifying that inodes are in the right subvolumes
*/
static int check_path(struct btree_trans *trans,
- struct pathbuf *p,
+ pathbuf *p,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
struct bch_fs *c = trans->c;
int ret = 0;
- snapshot = snapshot_t(c, snapshot)->equiv;
+ snapshot = bch2_snapshot_equiv(c, snapshot);
p->nr = 0;
while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
@@ -1920,14 +2049,14 @@ static int check_path(struct btree_trans *trans,
/* XXX print path */
bch_err(c, "directory structure loop");
- for (i = p->entries; i < p->entries + p->nr; i++)
+ darray_for_each(*p, i)
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
if (!fsck_err(c, "directory structure loop"))
return 0;
- ret = __bch2_trans_do(trans, NULL, NULL,
+ ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
remove_backpointer(trans, inode));
@@ -1941,7 +2070,7 @@ static int check_path(struct btree_trans *trans,
}
fsck_err:
if (ret)
- bch_err(c, "%s: err %i", __func__, ret);
+ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -1957,7 +2086,7 @@ static int check_directory_structure(struct bch_fs *c)
struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked u;
- struct pathbuf path = { 0, 0, NULL };
+ pathbuf path = { 0, };
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
@@ -1985,9 +2114,7 @@ static int check_directory_structure(struct bch_fs *c)
}
bch2_trans_iter_exit(&trans, &iter);
- BUG_ON(ret == -EINTR);
-
- kfree(path.entries);
+ darray_exit(&path);
bch2_trans_exit(&trans);
return ret;
@@ -2009,11 +2136,12 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t,
{
if (t->nr == t->size) {
size_t new_size = max_t(size_t, 128UL, t->size * 2);
- void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
+ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
+
if (!d) {
bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
new_size);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_fsck_add_nlink;
}
if (t->d)
@@ -2052,8 +2180,8 @@ static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
if (inum < range_start || inum >= range_end)
return;
- link = bsearch(&key, links->d, links->nr,
- sizeof(links->d[0]), nlink_cmp);
+ link = __inline_bsearch(&key, links->d, links->nr,
+ sizeof(links->d[0]), nlink_cmp);
if (!link)
return;
@@ -2138,7 +2266,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = snapshots_seen_update(c, &s, k.k->p);
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
if (ret)
break;
@@ -2150,7 +2278,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
d.v->d_type != DT_SUBVOL)
inc_link(c, &s, links, range_start, range_end,
le64_to_cpu(d.v->d_inum),
- d.k->p.snapshot);
+ bch2_snapshot_equiv(c, d.k->p.snapshot));
break;
}
}
@@ -2164,6 +2292,47 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
return ret;
}
+static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct nlink_table *links,
+ size_t *idx, u64 range_end)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct nlink *link = &links->d[*idx];
+ int ret = 0;
+
+ if (k.k->p.offset >= range_end)
+ return 1;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (S_ISDIR(le16_to_cpu(u.bi_mode)))
+ return 0;
+
+ if (!u.bi_nlink)
+ return 0;
+
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
+ BUG_ON(*idx == links->nr);
+ link = &links->d[++*idx];
+ }
+
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+ "inode %llu type %s has wrong i_nlink (%u, should be %u)",
+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
+ bch2_inode_nlink_get(&u), link->count)) {
+ bch2_inode_nlink_set(&u, link->count);
+ ret = __write_inode(trans, &u, k.k->p.snapshot);
+ }
+fsck_err:
+ return ret;
+}
+
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
struct nlink_table *links,
@@ -2172,56 +2341,25 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bch_inode_unpacked u;
- struct nlink *link = links->d;
+ size_t idx = 0;
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_inodes,
- POS(0, range_start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (k.k->p.offset >= range_end)
- break;
-
- if (!bkey_is_inode(k.k))
- continue;
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+ POS(0, range_start),
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
- BUG_ON(bch2_inode_unpack(k, &u));
-
- if (S_ISDIR(le16_to_cpu(u.bi_mode)))
- continue;
-
- if (!u.bi_nlink)
- continue;
-
- while ((cmp_int(link->inum, k.k->p.offset) ?:
- cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
- link++;
- BUG_ON(link >= links->d + links->nr);
- }
-
- if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
- "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
- u.bi_inum, mode_to_type(u.bi_mode),
- bch2_inode_nlink_get(&u), link->count)) {
- bch2_inode_nlink_set(&u, link->count);
-
- ret = write_inode(&trans, &u, k.k->p.snapshot);
- if (ret)
- bch_err(c, "error in fsck: error %i updating inode", ret);
- }
- }
-fsck_err:
- bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- if (ret)
+ if (ret < 0) {
bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ return ret;
+ }
- return ret;
+ return 0;
}
noinline_for_stack
@@ -2261,21 +2399,13 @@ static int check_nlinks(struct bch_fs *c)
return ret;
}
-static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct bkey_s_c k;
struct bkey_s_c_reflink_p p;
struct bkey_i_reflink_p *u;
int ret;
- k = bch2_btree_iter_peek(iter);
- if (!k.k)
- return 0;
-
- ret = bkey_err(k);
- if (ret)
- return ret;
-
if (k.k->type != KEY_TYPE_reflink_p)
return 0;
@@ -2311,20 +2441,11 @@ static int fix_reflink_p(struct bch_fs *c)
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (k.k->type == KEY_TYPE_reflink_p) {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- fix_reflink_p_key(&trans, &iter));
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ fix_reflink_p_key(&trans, &iter, k));
bch2_trans_exit(&trans);
return ret;
@@ -2336,9 +2457,12 @@ static int fix_reflink_p(struct bch_fs *c)
*/
int bch2_fsck_full(struct bch_fs *c)
{
- return bch2_fs_snapshots_check(c) ?:
+ int ret;
+again:
+ ret = bch2_fs_check_snapshots(c) ?:
+ bch2_fs_check_subvols(c) ?:
+ bch2_delete_dead_snapshots(c) ?:
check_inodes(c, true) ?:
- check_subvols(c) ?:
check_extents(c) ?:
check_dirents(c) ?:
check_xattrs(c) ?:
@@ -2346,9 +2470,19 @@ int bch2_fsck_full(struct bch_fs *c)
check_directory_structure(c) ?:
check_nlinks(c) ?:
fix_reflink_p(c);
+
+ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) {
+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+ goto again;
+ }
+
+ return ret;
}
int bch2_fsck_walk_inodes_only(struct bch_fs *c)
{
- return check_inodes(c, false);
+ return bch2_fs_check_snapshots(c) ?:
+ bch2_fs_check_subvols(c) ?:
+ bch2_delete_dead_snapshots(c) ?:
+ check_inodes(c, false);
}
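The fsck changes above replace the open-coded pathbuf growth with the darray helpers (darray_push(), darray_for_each(), darray_exit()). As a rough illustration of the pattern those helpers abstract, and not the actual fs/bcachefs/darray.h implementation, a grow-on-push array in plain userspace C might look like the sketch below; the sketch_* names are placeholders.

/*
 * Illustrative grow-on-push dynamic array (userspace C). This only
 * sketches the pattern darray_push() abstracts in the fsck code above;
 * it is not the bcachefs darray implementation.
 */
#include <stdlib.h>
#include <string.h>

struct sketch_darray {
	void	*data;
	size_t	nr;	/* elements in use */
	size_t	size;	/* allocated capacity, in elements */
};

static int sketch_push(struct sketch_darray *d, const void *elem, size_t elem_size)
{
	if (d->nr == d->size) {
		size_t new_size = d->size ? d->size * 2 : 8;
		void *n = realloc(d->data, new_size * elem_size);

		if (!n)
			return -1;	/* caller reports the allocation failure */

		d->data = n;
		d->size = new_size;
	}

	memcpy((char *) d->data + d->nr++ * elem_size, elem, elem_size);
	return 0;
}

As in path_down() above, the push reports failure to the caller, which logs it with bch_err() and returns the error, instead of open-coding the resize at every call site.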
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 3a7c14684102..7ccbc00b7156 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -60,11 +60,10 @@ static int inode_decode_field(const u8 *in, const u8 *end,
return bytes;
}
-void bch2_inode_pack(struct bch_fs *c,
- struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- struct bkey_i_inode_v2 *k = &packed->inode;
+ struct bkey_i_inode_v3 *k = &packed->inode;
u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
@@ -72,13 +71,17 @@ void bch2_inode_pack(struct bch_fs *c,
unsigned bytes;
int ret;
- bkey_inode_v2_init(&packed->inode.k_i);
+ bkey_inode_v3_init(&packed->inode.k_i);
packed->inode.k.p.offset = inode->bi_inum;
packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
- packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
+ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
+ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
+ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
+ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
+
#define x(_name, _bits) \
nr_fields++; \
@@ -99,7 +102,7 @@ void bch2_inode_pack(struct bch_fs *c,
*out++ = 0; \
}
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v3()
#undef x
BUG_ON(out > end);
@@ -110,7 +113,7 @@ void bch2_inode_pack(struct bch_fs *c,
set_bkey_val_bytes(&packed->inode.k, bytes);
memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODEv2_NR_FIELDS(&k->v, nr_fields);
+ SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
@@ -120,16 +123,25 @@ void bch2_inode_pack(struct bch_fs *c,
BUG_ON(ret);
BUG_ON(unpacked.bi_inum != inode->bi_inum);
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
+ BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
+ BUG_ON(unpacked.bi_size != inode->bi_size);
+ BUG_ON(unpacked.bi_version != inode->bi_version);
BUG_ON(unpacked.bi_mode != inode->bi_mode);
#define x(_name, _bits) if (unpacked._name != inode->_name) \
panic("unpacked %llu should be %llu", \
(u64) unpacked._name, (u64) inode->_name);
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v3()
#undef x
}
}
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bch2_inode_pack_inlined(packed, inode);
+}
+
static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
struct bch_inode_unpacked *unpacked)
{
@@ -141,9 +153,9 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
#define x(_name, _bits) \
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
- memset(&unpacked->_name, 0, \
- sizeof(*unpacked) - \
- offsetof(struct bch_inode_unpacked, _name)); \
+ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+ memset((void *) unpacked + offset, 0, \
+ sizeof(*unpacked) - offset); \
return 0; \
} \
\
@@ -157,7 +169,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
unpacked->_name = field[1]; \
in += ret;
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v2()
#undef x
/* XXX: signal if there were more fields than expected? */
@@ -196,16 +208,69 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
return -1; \
fieldnr++;
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v2()
#undef x
/* XXX: signal if there were more fields than expected? */
return 0;
}
-int bch2_inode_unpack(struct bkey_s_c k,
- struct bch_inode_unpacked *unpacked)
+static int bch2_inode_unpack_v3(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
+ unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
+ unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
+ unpacked->bi_mode = INODEv3_MODE(inode.v);
+
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v3()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ memset(unpacked, 0, sizeof(*unpacked));
+
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -243,6 +308,14 @@ int bch2_inode_unpack(struct bkey_s_c k,
}
}
+int bch2_inode_unpack(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ if (likely(k.k->type == KEY_TYPE_inode_v3))
+ return bch2_inode_unpack_v3(k, unpacked);
+ return bch2_inode_unpack_slowpath(k, unpacked);
+}
+
int bch2_inode_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
@@ -252,15 +325,13 @@ int bch2_inode_peek(struct btree_trans *trans,
u32 snapshot;
int ret;
- if (0 && trans->c->opts.inodes_use_key_cache)
- flags |= BTREE_ITER_CACHED;
-
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot), flags);
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
@@ -290,124 +361,195 @@ int bch2_inode_write(struct btree_trans *trans,
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
- bch2_inode_pack(trans->c, inode_p, inode);
+ bch2_inode_pack_inlined(inode_p, inode);
inode_p->inode.k.p.snapshot = iter->snapshot;
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
}
-const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
+ struct bch_inode_unpacked u;
+ struct bkey_inode_buf *inode_p;
+ int ret;
- if (k.k->p.inode)
- return "nonzero k.p.inode";
+ if (!bkey_is_inode(&k->k))
+ return ERR_PTR(-ENOENT);
- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
- return "incorrect value size";
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return ERR_CAST(inode_p);
- if (k.k->p.offset < BLOCKDEV_INODE_MAX)
- return "fs inode in blockdev range";
+ ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
+ if (ret)
+ return ERR_PTR(ret);
+
+ bch2_inode_pack(inode_p, &u);
+ return &inode_p->inode.k_i;
+}
+
+static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
+{
+ struct bch_inode_unpacked unpacked;
+
+ if (k.k->p.inode) {
+ prt_printf(err, "nonzero k.p.inode");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
- return "invalid str hash type";
+ if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
+ prt_printf(err, "fs inode in blockdev range");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bch2_inode_unpack(k, &unpacked))
- return "invalid variable length fields";
+ if (bch2_inode_unpack(k, &unpacked)) {
+ prt_printf(err, "invalid variable length fields");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
- return "invalid data checksum type";
+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) {
+ prt_printf(err, "invalid data checksum type (%u >= %u",
+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
- return "invalid data checksum type";
+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
+ prt_printf(err, "invalid data checksum type (%u >= %u)",
+ unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
+ return -BCH_ERR_invalid_bkey;
+ }
if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
- unpacked.bi_nlink != 0)
- return "flagged as unlinked but bi_nlink != 0";
+ unpacked.bi_nlink != 0) {
+ prt_printf(err, "flagged as unlinked but bi_nlink != 0");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
- return "subvolume root but not a directory";
+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
+ prt_printf(err, "subvolume root but not a directory");
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
-const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
- struct bch_inode_unpacked unpacked;
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- if (k.k->p.inode)
- return "nonzero k.p.inode";
+ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*inode.v));
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
- return "incorrect value size";
+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+ prt_printf(err, "invalid str hash type (%llu >= %u)",
+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (k.k->p.offset < BLOCKDEV_INODE_MAX)
- return "fs inode in blockdev range";
+ return __bch2_inode_invalid(k, err);
+}
- if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
- return "invalid str hash type";
+int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
+{
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
- if (bch2_inode_unpack(k, &unpacked))
- return "invalid variable length fields";
+ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*inode.v));
+ return -BCH_ERR_invalid_bkey;
+ }
- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
- return "invalid data checksum type";
+ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+ prt_printf(err, "invalid str hash type (%llu >= %u)",
+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
- return "invalid data checksum type";
+ return __bch2_inode_invalid(k, err);
+}
- if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
- unpacked.bi_nlink != 0)
- return "flagged as unlinked but bi_nlink != 0";
+int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
- return "subvolume root but not a directory";
+ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*inode.v));
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
+ prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
+ INODEv3_FIELDS_START(inode.v),
+ INODEv3_FIELDS_START_INITIAL,
+ bkey_val_u64s(inode.k));
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
+ prt_printf(err, "invalid str hash type (%llu >= %u)",
+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return __bch2_inode_invalid(k, err);
}
-static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+static void __bch2_inode_unpacked_to_text(struct printbuf *out,
+ struct bch_inode_unpacked *inode)
{
- pr_buf(out, "mode %o flags %x journal_seq %llu",
+ prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
inode->bi_mode, inode->bi_flags,
- inode->bi_journal_seq);
+ inode->bi_journal_seq,
+ inode->bi_size,
+ inode->bi_sectors,
+ inode->bi_version);
#define x(_name, _bits) \
- pr_buf(out, " "#_name " %llu", (u64) inode->_name);
- BCH_INODE_FIELDS()
+ prt_printf(out, " "#_name " %llu", (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
#undef x
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
- pr_buf(out, "inum: %llu ", inode->bi_inum);
+ prt_printf(out, "inum: %llu ", inode->bi_inum);
__bch2_inode_unpacked_to_text(out, inode);
}
-void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bch_inode_unpacked inode;
if (bch2_inode_unpack(k, &inode)) {
- pr_buf(out, "(unpack error)");
+ prt_printf(out, "(unpack error)");
return;
}
__bch2_inode_unpacked_to_text(out, &inode);
}
-const char *bch2_inode_generation_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (k.k->p.inode)
- return "nonzero k.p.inode";
+ if (k.k->p.inode) {
+ prt_printf(err, "nonzero k.p.inode");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_inode_generation));
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -415,7 +557,7 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
- pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
+ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}
void bch2_inode_init_early(struct bch_fs *c,
@@ -520,20 +662,9 @@ int bch2_inode_create(struct btree_trans *trans,
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
- bkey_cmp(k.k->p, POS(0, max)) < 0) {
- while (pos < iter->pos.offset) {
- if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
- goto found_slot;
-
- pos++;
- }
-
- if (k.k->p.snapshot == snapshot &&
- !bkey_is_inode(k.k) &&
- !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
- bch2_btree_iter_advance(iter);
- continue;
- }
+ bkey_lt(k.k->p, POS(0, max))) {
+ if (pos < iter->pos.offset)
+ goto found_slot;
/*
* We don't need to iterate over keys in every snapshot once
@@ -543,15 +674,11 @@ again:
bch2_btree_iter_set_pos(iter, POS(0, pos));
}
- while (!ret && pos < max) {
- if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
- goto found_slot;
-
- pos++;
- }
+ if (!ret && pos < max)
+ goto found_slot;
if (!ret && start == min)
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_inode_create;
if (ret) {
bch2_trans_iter_exit(trans, iter);
@@ -571,11 +698,6 @@ found_slot:
return ret;
}
- /* We may have raced while the iterator wasn't pointing at pos: */
- if (bkey_is_inode(k.k) ||
- bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
- goto again;
-
*hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
@@ -592,12 +714,11 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
int ret = 0;
/*
- * We're never going to be deleting extents, no need to use an extent
- * iterator:
+ * We're never going to be deleting partial extents, no need to use an
+ * extent iterator:
*/
bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
+ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
while (1) {
bch2_trans_begin(trans);
@@ -608,12 +729,12 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bch2_btree_iter_set_snapshot(&iter, snapshot);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
- if (!k.k || iter.pos.inode != inum.inum)
+ if (!k.k)
break;
bkey_init(&delete.k);
@@ -623,7 +744,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- if (ret && ret != -EINTR)
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
@@ -631,20 +752,16 @@ err:
return ret;
}
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans trans;
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
- unsigned iter_flags = BTREE_ITER_INTENT;
u32 snapshot;
int ret;
- if (0 && cached && c->opts.inodes_use_key_cache)
- iter_flags |= BTREE_ITER_CACHED;
-
bch2_trans_init(&trans, c, 0, 1024);
/*
@@ -668,7 +785,8 @@ retry:
goto err;
bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, snapshot), iter_flags);
+ SPOS(0, inum.inum, snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
@@ -677,17 +795,14 @@ retry:
if (!bkey_is_inode(k.k)) {
bch2_fs_inconsistent(trans.c,
- "inode %llu not found when deleting",
- inum.inum);
+ "inode %llu:%u not found when deleting",
+ inum.inum, snapshot);
ret = -EIO;
goto err;
}
bch2_inode_unpack(k, &inode_u);
- /* Subvolume root? */
- BUG_ON(inode_u.bi_subvol);
-
bkey_inode_generation_init(&delete.k_i);
delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
@@ -697,7 +812,7 @@ retry:
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
@@ -723,3 +838,58 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
return bch2_trans_do(c, NULL, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inum, inode));
}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_flags & BCH_INODE_UNLINKED)
+ bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ else {
+ if (bi->bi_nlink == U32_MAX)
+ return -EINVAL;
+
+ bi->bi_nlink++;
+ }
+
+ return 0;
+}
+
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
+ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+ bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_flags & BCH_INODE_UNLINKED) {
+ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_UNLINKED;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
+{
+ struct bch_opts ret = { 0 };
+#define x(_name, _bits) \
+ if (inode->bi_##_name) \
+ opt_set(ret, _name, inode->bi_##_name - 1);
+ BCH_INODE_OPTS()
+#undef x
+ return ret;
+}
+
+void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
+ struct bch_inode_unpacked *inode)
+{
+#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name);
+ BCH_INODE_OPTS()
+#undef x
+
+ if (opts->nocow)
+ opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
+}
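A change that recurs through inode.c (and the rest of the patch) is that callers stop comparing against -EINTR and instead test bch2_err_matches(ret, BCH_ERR_transaction_restart). The retry idiom used by bch2_inode_rm() above then looks roughly like the sketch below; example_trans_op() is a placeholder name and the loop body is elided, only the init/begin/exit calls and the restart check are taken from the diff.

/*
 * Sketch of the transaction-restart retry idiom used by bch2_inode_rm()
 * above. example_trans_op() is a placeholder, not a bcachefs function,
 * and the actual btree updates are elided.
 */
static int example_trans_op(struct bch_fs *c)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = 0;	/* ... iterate, update, bch2_trans_commit() ... */

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;	/* lock restart: rebuild the whole transaction */

	bch2_trans_exit(&trans);
	return ret;
}

Because restarts are now a distinct error class rather than an overloaded -EINTR, loops such as the one in bch2_inode_delete_keys() above can keep going on a restart while still breaking out on any real error.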
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 723186d8afb6..f5066afb4886 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -2,39 +2,52 @@
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
+#include "bkey.h"
#include "opts.h"
extern const char * const bch2_inode_opts[];
-const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
-const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_ops_inode (struct bkey_ops) { \
+#define bch2_bkey_ops_inode ((struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
-}
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+})
-#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \
+#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
.key_invalid = bch2_inode_v2_invalid, \
.val_to_text = bch2_inode_to_text, \
-}
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+})
+
+#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_v3_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+})
static inline bool bkey_is_inode(const struct bkey *k)
{
return k->type == KEY_TYPE_inode ||
- k->type == KEY_TYPE_inode_v2;
+ k->type == KEY_TYPE_inode_v2 ||
+ k->type == KEY_TYPE_inode_v3;
}
-const char *bch2_inode_generation_invalid(const struct bch_fs *,
- struct bkey_s_c);
-void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \
+#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
.key_invalid = bch2_inode_generation_invalid, \
.val_to_text = bch2_inode_generation_to_text, \
-}
+})
#if 0
typedef struct {
@@ -48,25 +61,28 @@ struct bch_inode_unpacked {
u64 bi_inum;
u64 bi_journal_seq;
__le64 bi_hash_seed;
+ u64 bi_size;
+ u64 bi_sectors;
+ u64 bi_version;
u32 bi_flags;
u16 bi_mode;
#define x(_name, _bits) u##_bits _name;
- BCH_INODE_FIELDS()
+ BCH_INODE_FIELDS_v3()
#undef x
};
struct bkey_inode_buf {
- struct bkey_i_inode_v2 inode;
+ struct bkey_i_inode_v3 inode;
#define x(_name, _bits) + 8 + _bits / 8
- u8 _pad[0 + BCH_INODE_FIELDS()];
+ u8 _pad[0 + BCH_INODE_FIELDS_v3()];
#undef x
-} __attribute__((packed, aligned(8)));
+} __packed __aligned(8);
-void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
- const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
@@ -87,24 +103,15 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
int bch2_inode_create(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, u32, u64);
-int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *);
int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
struct bch_inode_unpacked *);
-static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
-{
- struct bch_io_opts ret = { 0 };
-
-#define x(_name, _bits) \
- if (inode->bi_##_name) \
- opt_set(ret, _name, inode->bi_##_name - 1);
- BCH_INODE_OPTS()
-#undef x
- return ret;
-}
+#define inode_opt_get(_c, _inode, _name) \
+ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum inode_opt_id id, u64 v)
@@ -135,15 +142,6 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
}
}
-static inline struct bch_io_opts
-io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode)
-{
- struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
-
- bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode));
- return opts;
-}
-
static inline u8 mode_to_type(umode_t mode)
{
return (mode >> 12) & 15;
@@ -161,23 +159,6 @@ static inline unsigned nlink_bias(umode_t mode)
return S_ISDIR(mode) ? 2 : 1;
}
-static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
-{
- if (bi->bi_flags & BCH_INODE_UNLINKED)
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
- else
- bi->bi_nlink++;
-}
-
-static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi)
-{
- BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED);
- if (bi->bi_nlink)
- bi->bi_nlink--;
- else
- bi->bi_flags |= BCH_INODE_UNLINKED;
-}
-
static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
{
return bi->bi_flags & BCH_INODE_UNLINKED
@@ -197,4 +178,11 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
}
}
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
+void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
+ struct bch_inode_unpacked *);
+
#endif /* _BCACHEFS_INODE_H */
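The inode_opt_get() macro and bch2_inode_opts_to_opts() above both rely on the same encoding of per-inode options: a stored value of 0 means the option is unset and the filesystem-wide default applies, and any nonzero value stores the real option value plus one. A small standalone sketch of that convention follows; the sketch_* names are illustrative, not bcachefs API.

/*
 * Sketch of the option-inheritance encoding behind inode_opt_get():
 * 0 in the inode field means "inherit the fs default", n + 1 means
 * "override with value n". Names here are illustrative only.
 */
struct sketch_fs_opts	{ unsigned compression; };
struct sketch_inode	{ unsigned bi_compression; };	/* 0 = unset */

static unsigned sketch_effective_compression(const struct sketch_fs_opts *fs,
					     const struct sketch_inode *inode)
{
	return inode->bi_compression
		? inode->bi_compression - 1	/* per-inode override */
		: fs->compression;		/* fall back to the fs-wide default */
}

bch2_inode_opts_to_opts() applies the same minus-one shift when copying overrides into a struct bch_opts, and inode_opt_get() folds the fallback to c->opts into a single expression.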
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
index 21e1e3956f10..c3da325a25c8 100644
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
@@ -16,6 +16,7 @@
#include "checksum.h"
#include "compress.h"
#include "clock.h"
+#include "data_update.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@@ -26,12 +27,14 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include <linux/blkdev.h>
+#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
@@ -44,6 +47,8 @@ const char *bch2_blk_status_to_str(blk_status_t status)
return blk_status_to_str(status);
}
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
const struct bch_devs_mask *devs;
@@ -132,14 +137,23 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
}
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ return false;
+}
+
+#endif
+
/* Allocate, free from mempool: */
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{
+ struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned i;
- bio_for_each_segment_all(bv, bio, i)
+ bio_for_each_segment_all(bv, bio, iter)
if (bv->bv_page != ZERO_PAGE(0))
mempool_free(bv->bv_page, &c->bio_bounce_pages);
bio->bi_vcnt = 0;
@@ -204,7 +218,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
bch2_trans_copy_iter(&iter, extent_iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
+ for_each_btree_key_upto_continue_norestart(iter,
+ new->k.p, BTREE_ITER_SLOTS, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
@@ -224,7 +239,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
(!new_compressed && bch2_bkey_sectors_compressed(old))))
*usage_increasing = true;
- if (bkey_cmp(old.k->p, new->k.p) >= 0)
+ if (bkey_ge(old.k->p, new->k.p))
break;
}
@@ -232,18 +247,69 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
return ret;
}
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ u64 new_i_size,
+ s64 i_sectors_delta)
+{
+ struct btree_iter iter;
+ struct bkey_i *k;
+ struct bkey_i_inode_v3 *inode;
+ unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0,
+ extent_iter->pos.inode,
+ extent_iter->snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ k = bch2_bkey_get_mut(trans, &iter);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ goto err;
+
+ if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
+ k = bch2_inode_to_v3(trans, k);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ inode = bkey_i_to_inode_v3(k);
+
+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+ new_i_size > le64_to_cpu(inode->v.bi_size)) {
+ inode->v.bi_size = cpu_to_le64(new_i_size);
+ inode_update_flags = 0;
+ }
+
+ if (i_sectors_delta) {
+ le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+ inode_update_flags = 0;
+ }
+
+ if (inode->k.p.snapshot != iter.snapshot) {
+ inode->k.p.snapshot = iter.snapshot;
+ inode_update_flags = 0;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &inode->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ inode_update_flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
int bch2_extent_update(struct btree_trans *trans,
subvol_inum inum,
struct btree_iter *iter,
struct bkey_i *k,
struct disk_reservation *disk_res,
- u64 *journal_seq,
u64 new_i_size,
s64 *i_sectors_delta_total,
bool check_enospc)
{
- struct btree_iter inode_iter;
- struct bch_inode_unpacked inode_u;
struct bpos next_pos;
bool usage_increasing;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
@@ -263,7 +329,6 @@ int bch2_extent_update(struct btree_trans *trans,
if (ret)
return ret;
- new_i_size = min(k->k.p.offset << 9, new_i_size);
next_pos = k->k.p;
ret = bch2_sum_sector_overwrites(trans, iter, k,
@@ -283,36 +348,161 @@ int bch2_extent_update(struct btree_trans *trans,
return ret;
}
- ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
- BTREE_ITER_INTENT);
- if (ret)
+ /*
+ * Note:
+ * We always have to do an inode update - even when i_size/i_sectors
+ * aren't changing - for fsync to work properly; fsync relies on
+ * inode->bi_journal_seq which is updated by the trigger code:
+ */
+ ret = bch2_extent_update_i_size_sectors(trans, iter,
+ min(k->k.p.offset << 9, new_i_size),
+ i_sectors_delta) ?:
+ bch2_trans_update(trans, iter, k, 0) ?:
+ bch2_trans_commit(trans, disk_res, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL);
+ if (unlikely(ret))
return ret;
- if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > inode_u.bi_size)
- inode_u.bi_size = new_i_size;
+ if (i_sectors_delta_total)
+ *i_sectors_delta_total += i_sectors_delta;
+ bch2_btree_iter_set_pos(iter, next_pos);
+ return 0;
+}
- inode_u.bi_sectors += i_sectors_delta;
+/* Overwrites whatever was present with zeroes: */
+int bch2_extent_fallocate(struct btree_trans *trans,
+ subvol_inum inum,
+ struct btree_iter *iter,
+ unsigned sectors,
+ struct bch_io_opts opts,
+ s64 *i_sectors_delta,
+ struct write_point_specifier write_point)
+{
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = { 0 };
+ struct closure cl;
+ struct open_buckets open_buckets;
+ struct bkey_s_c k;
+ struct bkey_buf old, new;
+ unsigned sectors_allocated;
+ bool have_reservation = false;
+ bool unwritten = opts.nocow &&
+ c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+ int ret;
- ret = bch2_trans_update(trans, iter, k, 0) ?:
- bch2_inode_write(trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(trans, disk_res, journal_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL);
- bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_bkey_buf_init(&old);
+ bch2_bkey_buf_init(&new);
+ closure_init_stack(&cl);
+ open_buckets.nr = 0;
+retry:
+ sectors_allocated = 0;
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
if (ret)
return ret;
- if (i_sectors_delta_total)
- *i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(iter, next_pos);
+ sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
- return 0;
+ if (!have_reservation) {
+ unsigned new_replicas =
+ max(0, (int) opts.data_replicas -
+ (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+ /*
+ * Get a disk reservation before (in the nocow case) calling
+ * into the allocator:
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+ if (unlikely(ret))
+ goto out;
+
+ bch2_bkey_buf_reassemble(&old, c, k);
+ }
+
+ if (have_reservation) {
+ if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
+ goto out;
+
+ bch2_key_resize(&new.k->k, sectors);
+ } else if (!unwritten) {
+ struct bkey_i_reservation *reservation;
+
+ bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+ reservation = bkey_reservation_init(new.k);
+ reservation->k.p = iter->pos;
+ bch2_key_resize(&reservation->k, sectors);
+ reservation->v.nr_replicas = opts.data_replicas;
+ } else {
+ struct bkey_i_extent *e;
+ struct bch_devs_list devs_have;
+ struct write_point *wp;
+ struct bch_extent_ptr *ptr;
+
+ devs_have.nr = 0;
+
+ bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+ e = bkey_extent_init(new.k);
+ e->k.p = iter->pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ opts.foreground_target,
+ false,
+ write_point,
+ &devs_have,
+ opts.data_replicas,
+ opts.data_replicas,
+ RESERVE_none, 0, &cl, &wp);
+ if (ret) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ goto retry;
+ return ret;
+ }
+
+ sectors = min(sectors, wp->sectors_free);
+ sectors_allocated = sectors;
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ }
+
+ have_reservation = true;
+
+ ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+ 0, i_sectors_delta, true);
+out:
+ if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ bch2_trans_begin(trans);
+ goto retry;
+ }
+
+ if (!ret && sectors_allocated)
+ bch2_increment_clock(c, sectors_allocated, WRITE);
+
+ bch2_open_buckets_put(c, &open_buckets);
+ bch2_disk_reservation_put(c, &disk_res);
+ bch2_bkey_buf_exit(&new, c);
+ bch2_bkey_buf_exit(&old, c);
+
+ return ret;
}
/*
- * Returns -EINTR if we had to drop locks:
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
*/
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
subvol_inum inum, u64 end,
@@ -325,7 +515,8 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
int ret = 0, ret2 = 0;
u32 snapshot;
- while (!ret || ret == -EINTR) {
+ while (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
@@ -341,11 +532,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_btree_iter_set_snapshot(iter, snapshot);
- k = bch2_btree_iter_peek(iter);
- if (bkey_cmp(iter->pos, end_pos) >= 0) {
- bch2_btree_iter_set_pos(iter, end_pos);
+ /*
+ * peek_upto() doesn't have ideal semantics for extents:
+ */
+ k = bch2_btree_iter_peek_upto(iter, end_pos);
+ if (!k.k)
break;
- }
ret = bkey_err(k);
if (ret)
@@ -359,8 +551,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_cut_back(end_pos, &delete);
ret = bch2_extent_update(trans, inum, iter, &delete,
- &disk_res, NULL,
- 0, i_sectors_delta, false);
+ &disk_res, 0, i_sectors_delta, false);
bch2_disk_reservation_put(c, &disk_res);
}
@@ -384,14 +575,16 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return ret == -EINTR ? 0 : ret;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+
+ return ret;
}
-int bch2_write_index_default(struct bch_write_op *op)
+static int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct bkey_buf sk;
- struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
struct keylist *keys = &op->insert_keys;
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
@@ -415,7 +608,7 @@ int bch2_write_index_default(struct bch_write_op *op)
ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
&sk.k->k.p.snapshot);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
@@ -425,20 +618,17 @@ int bch2_write_index_default(struct bch_write_op *op)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_extent_update(&trans, inum, &iter, sk.k,
- &op->res, op_journal_seq(op),
+ &op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
- if (ec_ob)
- bch2_ob_add_backpointer(c, ec_ob, &sk.k->k);
-
- if (bkey_cmp(iter.pos, k->k.p) >= 0)
+ if (bkey_ge(iter.pos, k->k.p))
bch2_keylist_pop_front(&op->insert_keys);
else
bch2_cut_front(iter.pos, k);
@@ -454,7 +644,8 @@ int bch2_write_index_default(struct bch_write_op *op)
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
- const struct bkey_i *k)
+ const struct bkey_i *k,
+ bool nocow)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
const struct bch_extent_ptr *ptr;
@@ -470,8 +661,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
ca = bch_dev_bkey_exists(c, ptr->dev);
if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
- &ca->replica_set));
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+ GFP_NOIO, &ca->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
@@ -488,9 +679,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = bch2_dev_get_ioref(ca,
+ n->have_ioref = nocow || bch2_dev_get_ioref(ca,
type == BCH_DATA_btree ? READ : WRITE);
+ n->nocow = nocow;
n->submit_time = local_clock();
+ n->inode_offset = bkey_start_offset(&k->k);
n->bio.bi_iter.bi_sector = ptr->offset;
if (likely(n->have_ioref)) {
@@ -498,6 +691,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
bio_sectors(&n->bio));
bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+ if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+ bio_endio(&n->bio);
+ continue;
+ }
+
submit_bio(&n->bio);
} else {
n->bio.bi_status = BLK_STS_REMOVED;
@@ -506,42 +705,31 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
}
-static void __bch2_write(struct closure *);
+static void __bch2_write(struct bch_write_op *);
static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
- if (!op->error && (op->flags & BCH_WRITE_FLUSH))
- op->error = bch2_journal_error(&c->journal);
-
bch2_disk_reservation_put(c, &op->res);
- percpu_ref_put(&c->writes);
+ if (!(op->flags & BCH_WRITE_MOVE))
+ bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- if (op->end_io) {
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
+ EBUG_ON(cl->parent);
+ closure_debug_destroy(cl);
+ if (op->end_io)
op->end_io(op);
- } else {
- closure_return(cl);
- }
}
-/**
- * bch_write_index - after a write, update index to point to new data
- */
-static void __bch2_write_index(struct bch_write_op *op)
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
- struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bch_extent_ptr *ptr;
- struct bkey_i *src, *dst = keys->keys, *n, *k;
- unsigned dev;
- int ret;
+ struct bkey_i *src, *dst = keys->keys, *n;
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
@@ -550,46 +738,67 @@ static void __bch2_write_index(struct bch_write_op *op)
bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
test_bit(ptr->dev, op->failed.d));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
- ret = -EIO;
- goto err;
- }
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+ return -EIO;
}
if (dst != src)
- memmove_u64s_down(dst, src, src->u64s);
+ memmove_u64s_down(dst, src, src->k.u64s);
dst = bkey_next(dst);
}
keys->top = dst;
+ return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_i *k;
+ unsigned dev;
+ int ret = 0;
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ ret = bch2_write_drop_io_error_ptrs(op);
+ if (ret)
+ goto err;
+ }
/*
* probably not the ideal place to hook this in, but I don't
* particularly want to plumb io_opts all the way through the btree
* update stack right now
*/
- for_each_keylist_key(keys, k) {
+ for_each_keylist_key(keys, k)
bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
- if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
- bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
-
- }
-
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
- int ret = op->index_update_fn(op);
- BUG_ON(ret == -EINTR);
+ ret = !(op->flags & BCH_WRITE_MOVE)
+ ? bch2_write_index_default(op)
+ : bch2_data_update_index_update(op);
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
BUG_ON(keylist_sectors(keys) && !ret);
op->written += sectors_start - keylist_sectors(keys);
- if (ret) {
- bch_err_inum_ratelimited(c, op->pos.inode,
- "write error %i from btree update", ret);
- op->error = ret;
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
+
+ bch_err_inum_offset_ratelimited(c,
+ k->k.p.inode, k->k.p.offset << 9,
+ "write error while doing btree update: %s",
+ bch2_err_str(ret));
}
+
+ if (ret)
+ goto err;
}
out:
/* If a bucket wasn't written, we can't erasure code it: */
@@ -601,25 +810,90 @@ out:
err:
keys->top = keys->keys;
op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
goto out;
}
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+ if (state != wp->state) {
+ u64 now = ktime_get_ns();
+
+ if (wp->last_state_change &&
+ time_after64(now, wp->last_state_change))
+ wp->time[wp->state] += now - wp->last_state_change;
+ wp->state = state;
+ wp->last_state_change = now;
+ }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+ enum write_point_state state;
+
+ state = running ? WRITE_POINT_running :
+ !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+ : WRITE_POINT_stopped;
+
+ __wp_update_state(wp, state);
+}
+
static void bch2_write_index(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
+ struct write_point *wp = op->wp;
+ struct workqueue_struct *wq = index_update_wq(op);
+ unsigned long flags;
- __bch2_write_index(op);
+ if ((op->flags & BCH_WRITE_DONE) &&
+ (op->flags & BCH_WRITE_MOVE))
+ bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
- if (!(op->flags & BCH_WRITE_DONE)) {
- continue_at(cl, __bch2_write, index_update_wq(op));
- } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
- bch2_journal_flush_seq_async(&c->journal,
- *op_journal_seq(op),
- cl);
- continue_at(cl, bch2_write_done, index_update_wq(op));
- } else {
- continue_at_nobarrier(cl, bch2_write_done, NULL);
+ spin_lock_irqsave(&wp->writes_lock, flags);
+ if (wp->state == WRITE_POINT_waiting_io)
+ __wp_update_state(wp, WRITE_POINT_waiting_work);
+ list_add_tail(&op->wp_list, &wp->writes);
+ spin_unlock_irqrestore (&wp->writes_lock, flags);
+
+ queue_work(wq, &wp->index_update_work);
+}
+
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+ op->wp = wp;
+
+ if (wp->state == WRITE_POINT_stopped) {
+ spin_lock_irq(&wp->writes_lock);
+ __wp_update_state(wp, WRITE_POINT_waiting_io);
+ spin_unlock_irq(&wp->writes_lock);
+ }
+}
+
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+ struct write_point *wp =
+ container_of(work, struct write_point, index_update_work);
+ struct bch_write_op *op;
+
+ while (1) {
+ spin_lock_irq(&wp->writes_lock);
+ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+ if (op)
+ list_del(&op->wp_list);
+ wp_update_state(wp, op != NULL);
+ spin_unlock_irq(&wp->writes_lock);
+
+ if (!op)
+ break;
+
+ op->flags |= BCH_WRITE_IN_WORKER;
+
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ __bch2_write(op);
+ else
+ bch2_write_done(&op->cl);
}
}
@@ -634,10 +908,15 @@ static void bch2_write_endio(struct bio *bio)
if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
op->pos.inode,
- op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
+ wbio->inode_offset << 9,
"data write error: %s",
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status))) {
set_bit(wbio->dev, op->failed.d);
+ op->flags |= BCH_WRITE_IO_ERROR;
+ }
+
+ if (wbio->nocow)
+ set_bit(wbio->dev, op->devs_need_flush->d);
if (wbio->have_ioref) {
bch2_latency_acct(ca, wbio->submit_time, WRITE);
@@ -652,10 +931,8 @@ static void bch2_write_endio(struct bio *bio)
if (parent)
bio_endio(&parent->bio);
- else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
- closure_put(cl);
else
- continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
+ closure_put(cl);
}
static void init_append_extent(struct bch_write_op *op,
@@ -663,7 +940,6 @@ static void init_append_extent(struct bch_write_op *op,
struct bversion version,
struct bch_extent_crc_unpacked crc)
{
- struct bch_fs *c = op->c;
struct bkey_i_extent *e;
op->pos.offset += crc.uncompressed_size;
@@ -678,7 +954,7 @@ static void init_append_extent(struct bch_write_op *op,
crc.nonce)
bch2_extent_crc_append(&e->k_i, crc);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size,
+ bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
op->flags & BCH_WRITE_CACHED);
bch2_keylist_push(&op->insert_keys);
@@ -699,9 +975,10 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
? ((unsigned long) buf & (PAGE_SIZE - 1))
: 0), PAGE_SIZE);
- pages = min_t(unsigned, pages, BIO_MAX_PAGES);
+ pages = min(pages, BIO_MAX_VECS);
- bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
+ bio = bio_alloc_bioset(NULL, pages, 0,
+ GFP_NOIO, &c->bio_write);
wbio = wbio_init(bio);
wbio->put_bio = true;
/* copy WRITE_SYNC flag */
@@ -764,6 +1041,7 @@ static int bch2_write_decrypt(struct bch_write_op *op)
struct bch_fs *c = op->c;
struct nonce nonce = extent_nonce(op->version, op->crc);
struct bch_csum csum;
+ int ret;
if (!bch2_csum_type_is_encryption(op->crc.csum_type))
return 0;
@@ -778,10 +1056,10 @@ static int bch2_write_decrypt(struct bch_write_op *op)
if (bch2_crc_cmp(op->crc.csum, csum))
return -EIO;
- bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
op->crc.csum_type = 0;
op->crc.csum = (struct bch_csum) { 0, 0 };
- return 0;
+ return ret;
}
static enum prep_encoded_ret {
@@ -911,8 +1189,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
saved_iter = dst->bi_iter;
do {
- struct bch_extent_crc_unpacked crc =
- (struct bch_extent_crc_unpacked) { 0 };
+ struct bch_extent_crc_unpacked crc = { 0 };
struct bversion version = op->version;
size_t dst_len, src_len;
@@ -964,6 +1241,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
!crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
+ u8 compression_type = crc.compression_type;
+ u16 nonce = crc.nonce;
/*
* Note: when we're using rechecksum(), we need to be
* checksumming @src because it has all the data our
@@ -982,6 +1261,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bio_sectors(src) - (src_len >> 9),
op->csum_type))
goto csum_err;
+ /*
+ * bch2_rechecksum_bio() sets compression_type on crc from op->crc;
+ * this isn't always correct, as sometimes we're changing
+ * an extent from uncompressed to incompressible.
+ */
+ crc.compression_type = compression_type;
+ crc.nonce = nonce;
} else {
if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
bch2_rechecksum_bio(c, src, version, op->crc,
@@ -996,8 +1282,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.live_size = src_len >> 9;
swap(dst->bi_iter.bi_size, dst_len);
- bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
+ ret = bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ if (ret)
+ goto err;
+
crc.csum = bch2_checksum_bio(c, op->csum_type,
extent_nonce(version, crc), dst);
crc.csum_type = op->csum_type;
@@ -1038,8 +1327,7 @@ do_write:
*_dst = dst;
return more;
csum_err:
- bch_err(c, "error verifying existing checksum while "
- "rewriting existing data (memory corruption?)");
+ bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
@@ -1050,17 +1338,334 @@ err:
return ret;
}
-static void __bch2_write(struct closure *cl)
+static bool bch2_extent_is_writeable(struct bch_write_op *op,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_s_c_extent e;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ unsigned replicas = 0;
+
+ if (k.k->type != KEY_TYPE_extent)
+ return false;
+
+ e = bkey_s_c_to_extent(k);
+ extent_for_each_ptr_decode(e, p, entry) {
+ if (p.crc.csum_type ||
+ crc_is_compressed(p.crc) ||
+ p.has_ec)
+ return false;
+
+ replicas += bch2_extent_ptr_durability(c, &p);
+ }
+
+ return replicas >= op->opts.data_replicas;
+}
+
+static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ const struct bch_extent_ptr *ptr;
+ struct bkey_i *k;
+
+ for_each_keylist_key(&op->insert_keys, k) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+
+ bkey_for_each_ptr(ptrs, ptr)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, ptr),
+ BUCKET_NOCOW_LOCK_UPDATE);
+ }
+}
+
+static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *orig,
+ struct bkey_s_c k,
+ u64 new_i_size)
+{
+ struct bkey_i *new;
+ struct bkey_ptrs ptrs;
+ struct bch_extent_ptr *ptr;
+ int ret;
+
+ if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
+ /* trace this */
+ return 0;
+ }
+
+ new = bch2_bkey_make_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bch2_cut_front(bkey_start_pos(&orig->k), new);
+ bch2_cut_back(orig->k.p, new);
+
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr)
+ ptr->unwritten = 0;
+
+ /*
+ * Note that we're not calling bch2_subvol_get_snapshot() in this path -
+ * that was done when we kicked off the write, and here it's important
+ * that we update the extent that we wrote to - even if a snapshot has
+ * since been created. The write is still outstanding, so we're ok
+ * w.r.t. snapshot atomicity:
+ */
+ return bch2_extent_update_i_size_sectors(trans, iter,
+ min(new->k.p.offset << 9, new_i_size), 0) ?:
+ bch2_trans_update(trans, iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_i *orig;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_keylist_key(&op->insert_keys, orig) {
+ ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
+ bkey_start_pos(&orig->k), orig->k.p,
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
+ }));
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
+
+ bch_err_inum_offset_ratelimited(c,
+ k->k.p.inode, k->k.p.offset << 9,
+ "write error while doing btree update: %s",
+ bch2_err_str(ret));
+ }
+
+ if (ret) {
+ op->error = ret;
+ break;
+ }
+ }
+
+ bch2_trans_exit(&trans);
+}
+
+static void __bch2_nocow_write_done(struct bch_write_op *op)
+{
+ bch2_nocow_write_unlock(op);
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ op->error = -EIO;
+ } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ bch2_nocow_write_convert_unwritten(op);
+}
+
+static void bch2_nocow_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+
+ __bch2_nocow_write_done(op);
+ bch2_write_done(cl);
+}
+
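+/*
+ * In-place (nocow) write path: walk the existing extents covering the write,
+ * check that they can be written in place (no checksums, no compression, no
+ * erasure coding, and enough durable replicas), take nocow locks on the
+ * target buckets and recheck the bucket gens for staleness, then submit the
+ * IO directly to the existing pointers.  If any of this fails we return
+ * without BCH_WRITE_DONE set and the caller falls back to the normal COW
+ * write path.
+ */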
+static void bch2_nocow_write(struct bch_write_op *op)
+{
struct bch_fs *c = op->c;
- struct write_point *wp;
- struct bio *bio;
- bool skip_put = true;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ const struct bch_extent_ptr *ptr;
+ struct {
+ struct bpos b;
+ unsigned gen;
+ struct nocow_lock_bucket *l;
+ } buckets[BCH_REPLICAS_MAX];
+ unsigned nr_buckets = 0;
+ u32 snapshot;
+ int ret, i;
+
+ if (op->flags & BCH_WRITE_MOVE)
+ return;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot);
+ if (unlikely(ret))
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(op->pos.inode, op->pos.offset, snapshot),
+ BTREE_ITER_SLOTS);
+ while (1) {
+ struct bio *bio = &op->wbio.bio;
+
+ nr_buckets = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ /* fall back to normal cow write path? */
+ if (unlikely(k.k->p.snapshot != snapshot ||
+ !bch2_extent_is_writeable(op, k)))
+ break;
+
+ if (bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ k.k->u64s))
+ break;
+
+ /* Get iorefs before dropping btree locks: */
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr) {
+ buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
+ buckets[nr_buckets].gen = ptr->gen;
+ buckets[nr_buckets].l =
+ bucket_nocow_lock(&c->nocow_locks,
+ bucket_to_u64(buckets[nr_buckets].b));
+
+ prefetch(buckets[nr_buckets].l);
+
+ if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
+ goto err_get_ioref;
+
+ nr_buckets++;
+
+ if (ptr->unwritten)
+ op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ }
+
+ /* Unlock before taking nocow locks, doing IO: */
+ bkey_reassemble(op->insert_keys.top, k);
+ bch2_trans_unlock(&trans);
+
+ bch2_cut_front(op->pos, op->insert_keys.top);
+ if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
+ for (i = 0; i < nr_buckets; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
+ struct nocow_lock_bucket *l = buckets[i].l;
+ bool stale;
+
+ __bch2_bucket_nocow_lock(&c->nocow_locks, l,
+ bucket_to_u64(buckets[i].b),
+ BUCKET_NOCOW_LOCK_UPDATE);
+
+ rcu_read_lock();
+ stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
+ rcu_read_unlock();
+
+ if (unlikely(stale))
+ goto err_bucket_stale;
+ }
+
+ bio = &op->wbio.bio;
+ if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
+ bio = bio_split(bio, k.k->p.offset - op->pos.offset,
+ GFP_KERNEL, &c->bio_write);
+ wbio_init(bio)->put_bio = true;
+ bio->bi_opf = op->wbio.bio.bi_opf;
+ } else {
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ op->pos.offset += bio_sectors(bio);
+ op->written += bio_sectors(bio);
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+ closure_get(&op->cl);
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+ op->insert_keys.top, true);
+
+ bch2_keylist_push(&op->insert_keys);
+ if (op->flags & BCH_WRITE_DONE)
+ break;
+ bch2_btree_iter_advance(&iter);
+ }
+out:
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret) {
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s: btree lookup error %s",
+ __func__, bch2_err_str(ret));
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ bch2_trans_exit(&trans);
+
+ /* fall back to cow write path? */
+ if (!(op->flags & BCH_WRITE_DONE)) {
+ closure_sync(&op->cl);
+ __bch2_nocow_write_done(op);
+ op->insert_keys.top = op->insert_keys.keys;
+ } else if (op->flags & BCH_WRITE_SYNC) {
+ closure_sync(&op->cl);
+ bch2_nocow_write_done(&op->cl);
+ } else {
+ /*
+ * XXX
+ * needs to run out of process context because ei_quota_lock is
+ * a mutex
+ */
+ continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
+ }
+ return;
+err_get_ioref:
+ for (i = 0; i < nr_buckets; i++)
+ percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
+
+ /* Fall back to COW path: */
+ goto out;
+err_bucket_stale:
+ while (--i >= 0)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ buckets[i].b,
+ BUCKET_NOCOW_LOCK_UPDATE);
+ for (i = 0; i < nr_buckets; i++)
+ percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
+
+ /* We can retry this: */
+ ret = BCH_ERR_transaction_restart;
+ goto out;
+}
+
+static void __bch2_write(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct write_point *wp = NULL;
+ struct bio *bio = NULL;
unsigned nofs_flags;
int ret;
nofs_flags = memalloc_nofs_save();
+
+ if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
+ bch2_nocow_write(op);
+ if (op->flags & BCH_WRITE_DONE)
+ goto out_nofs_restore;
+ }
again:
memset(&op->failed, 0, sizeof(op->failed));
@@ -1072,138 +1677,103 @@ again:
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets.v))
- goto flush_io;
+ break;
if (bch2_keylist_realloc(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys),
BKEY_EXTENT_U64s_MAX))
- goto flush_io;
-
- if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
- percpu_ref_is_dying(&c->writes)) {
- ret = -EROFS;
- goto err;
- }
+ break;
/*
* The copygc thread is now global, which means it's no longer
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
- wp = bch2_alloc_sectors_start(c,
- op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
- op->write_point,
- &op->devs_have,
- op->nr_replicas,
- op->nr_replicas_required,
- op->alloc_reserve,
- op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
- EBUG_ON(!wp);
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_alloc_sectors_start_trans(&trans,
+ op->target,
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->write_point,
+ &op->devs_have,
+ op->nr_replicas,
+ op->nr_replicas_required,
+ op->alloc_reserve,
+ op->flags,
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS))
+ ? NULL : &op->cl, &wp));
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ break;
- if (unlikely(IS_ERR(wp))) {
- if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
- ret = PTR_ERR(wp);
- goto err;
- }
-
- goto flush_io;
+ goto err;
}
- /*
- * It's possible for the allocator to fail, put us on the
- * freelist waitlist, and then succeed in one of various retry
- * paths: if that happens, we need to disable the skip_put
- * optimization because otherwise there won't necessarily be a
- * barrier before we free the bch_write_op:
- */
- if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
- skip_put = false;
+ EBUG_ON(!wp);
bch2_open_bucket_get(c, wp, &op->open_buckets);
ret = bch2_write_extent(op, wp, &bio);
- bch2_alloc_sectors_done(c, wp);
-
- if (ret < 0)
- goto err;
- if (ret) {
- skip_put = false;
- } else {
- /*
- * for the skip_put optimization this has to be set
- * before we submit the bio:
- */
+ bch2_alloc_sectors_done_inlined(c, wp);
+err:
+ if (ret <= 0) {
op->flags |= BCH_WRITE_DONE;
+
+ if (ret < 0) {
+ op->error = ret;
+ break;
+ }
}
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
- if (!skip_put)
- closure_get(bio->bi_private);
- else
- op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+ closure_get(bio->bi_private);
key_to_write = (void *) (op->insert_keys.keys_p +
key_to_write_offset);
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- key_to_write);
+ key_to_write, false);
} while (ret);
- if (!skip_put)
- continue_at(cl, bch2_write_index, index_update_wq(op));
-out:
- memalloc_nofs_restore(nofs_flags);
- return;
-err:
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
-
- continue_at(cl, bch2_write_index, index_update_wq(op));
- goto out;
-flush_io:
/*
- * If the write can't all be submitted at once, we generally want to
- * block synchronously as that signals backpressure to the caller.
+ * Sync or no?
*
- * However, if we're running out of a workqueue, we can't block here
- * because we'll be blocking other work items from completing:
+ * If we're running asynchronously, we may still want to block
+ * synchronously here if we weren't able to submit all of the IO at
+ * once, as that signals backpressure to the caller.
*/
- if (current->flags & PF_WQ_WORKER) {
- continue_at(cl, bch2_write_index, index_update_wq(op));
- goto out;
- }
-
- closure_sync(cl);
-
- if (!bch2_keylist_empty(&op->insert_keys)) {
+ if ((op->flags & BCH_WRITE_SYNC) ||
+ (!(op->flags & BCH_WRITE_DONE) &&
+ !(op->flags & BCH_WRITE_IN_WORKER))) {
+ closure_sync(&op->cl);
__bch2_write_index(op);
- if (op->error) {
- op->flags |= BCH_WRITE_DONE;
- continue_at_nobarrier(cl, bch2_write_done, NULL);
- goto out;
- }
+ if (!(op->flags & BCH_WRITE_DONE))
+ goto again;
+ bch2_write_done(&op->cl);
+ } else {
+ bch2_write_queue(op, wp);
+ continue_at(&op->cl, bch2_write_index, NULL);
}
-
- goto again;
+out_nofs_restore:
+ memalloc_nofs_restore(nofs_flags);
}
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
- struct closure *cl = &op->cl;
struct bio *bio = &op->wbio.bio;
struct bvec_iter iter;
struct bkey_i_inline_data *id;
unsigned sectors;
int ret;
+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+ op->flags |= BCH_WRITE_DONE;
+
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
@@ -1231,11 +1801,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
set_bkey_val_bytes(&id->k, data_len);
bch2_keylist_push(&op->insert_keys);
- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_DONE;
-
- continue_at_nobarrier(cl, bch2_write_index, NULL);
- return;
+ __bch2_write_index(op);
err:
bch2_write_done(&op->cl);
}
@@ -1263,27 +1829,36 @@ void bch2_write(struct closure *cl)
struct bch_fs *c = op->c;
unsigned data_len;
+ EBUG_ON(op->cl.parent);
BUG_ON(!op->nr_replicas);
BUG_ON(!op->write_point.v);
- BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+ BUG_ON(bkey_eq(op->pos, POS_MAX));
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
- bch_err_inum_ratelimited(c, op->pos.inode,
- "misaligned write");
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "misaligned write");
op->error = -EIO;
goto err;
}
- if (c->opts.nochanges ||
- !percpu_ref_tryget(&c->writes)) {
- op->error = -EROFS;
+ if (c->opts.nochanges) {
+ op->error = -BCH_ERR_erofs_no_writes;
goto err;
}
+ if (!(op->flags & BCH_WRITE_MOVE) &&
+ !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
bch2_increment_clock(c, bio_sectors(bio), WRITE);
data_len = min_t(u64, bio->bi_iter.bi_size,
@@ -1295,31 +1870,54 @@ void bch2_write(struct closure *cl)
return;
}
- continue_at_nobarrier(cl, __bch2_write, NULL);
+ __bch2_write(op);
return;
err:
bch2_disk_reservation_put(c, &op->res);
- if (op->end_io) {
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
+ closure_debug_destroy(&op->cl);
+ if (op->end_io)
op->end_io(op);
- } else {
- closure_return(cl);
- }
+}
+
+const char * const bch2_write_flags[] = {
+#define x(f) #f,
+ BCH_WRITE_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+ prt_str(out, "pos: ");
+ bch2_bpos_to_text(out, op->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "started: ");
+ bch2_pr_time_units(out, local_clock() - op->start_time);
+ prt_newline(out);
+
+ prt_str(out, "flags: ");
+ prt_bitflags(out, bch2_write_flags, op->flags);
+ prt_newline(out);
+
+ prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
}
/* Cache promotion on read */
struct promote_op {
- struct closure cl;
struct rcu_head rcu;
u64 start_time;
struct rhash_head hash;
struct bpos pos;
- struct migrate_write write;
+ struct data_update write;
struct bio_vec bi_inline_vecs[0]; /* must be last */
};
@@ -1343,6 +1941,9 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
if (bch2_bkey_has_target(c, k, opts.promote_target))
return false;
+ if (bkey_extent_is_unwritten(k))
+ return false;
+
if (bch2_target_congested(c, opts.promote_target)) {
/* XXX trace this */
return false;
@@ -1359,33 +1960,31 @@ static void promote_free(struct bch_fs *c, struct promote_op *op)
{
int ret;
+ bch2_data_update_exit(&op->write);
+
ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret);
- percpu_ref_put(&c->writes);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}
-static void promote_done(struct closure *cl)
+static void promote_done(struct bch_write_op *wop)
{
struct promote_op *op =
- container_of(cl, struct promote_op, cl);
+ container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.op.c;
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
-
- bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
promote_free(c, op);
}
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
- struct bch_fs *c = rbio->c;
- struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
- trace_promote(&rbio->bio);
+ trace_and_count(op->write.op.c, read_promote, &rbio->bio);
/* we now own pages: */
BUG_ON(!rbio->bounce);
@@ -1395,14 +1994,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- bch2_migrate_read_done(&op->write, rbio);
-
- closure_init(cl, NULL);
- closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl);
- closure_return_with_destructor(cl, promote_done);
+ bch2_data_update_read_done(&op->write, rbio->pick.crc);
}
-static struct promote_op *__promote_alloc(struct bch_fs *c,
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_s_c k,
struct bpos pos,
@@ -1411,12 +2006,13 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
unsigned sectors,
struct bch_read_bio **rbio)
{
+ struct bch_fs *c = trans->c;
struct promote_op *op = NULL;
struct bio *bio;
unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;
- if (!percpu_ref_tryget(&c->writes))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return NULL;
op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
@@ -1437,7 +2033,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
goto err;
rbio_init(&(*rbio)->bio, opts);
- bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
+ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
GFP_NOIO))
@@ -1452,18 +2048,26 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
goto err;
bio = &op->write.op.wbio.bio;
- bio_init(bio, bio->bi_inline_vecs, pages);
+ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
- ret = bch2_migrate_write_init(c, &op->write,
+ ret = bch2_data_update_init(trans, NULL, &op->write,
writepoint_hashed((unsigned long) current),
opts,
- DATA_PROMOTE,
- (struct data_opts) {
+ (struct data_update_opts) {
.target = opts.promote_target,
- .nr_replicas = 1,
+ .extra_replicas = 1,
+ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
},
btree_id, k);
+ if (ret == -BCH_ERR_nocow_lock_blocked) {
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ goto err;
+ }
+
BUG_ON(ret);
+ op->write.op.end_io = promote_done;
return op;
err:
@@ -1472,21 +2076,22 @@ err:
kfree(*rbio);
*rbio = NULL;
kfree(op);
- percpu_ref_put(&c->writes);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return NULL;
}
noinline
-static struct promote_op *promote_alloc(struct bch_fs *c,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
- unsigned flags,
- struct bch_read_bio **rbio,
- bool *bounce,
- bool *read_full)
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_read_bio **rbio,
+ bool *bounce,
+ bool *read_full)
{
+ struct bch_fs *c = trans->c;
bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
@@ -1500,7 +2105,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
if (!should_promote(c, k, pos, opts, flags))
return NULL;
- promote = __promote_alloc(c,
+ promote = __promote_alloc(trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
@@ -1653,7 +2258,7 @@ static void bch2_rbio_retry(struct work_struct *work)
};
struct bch_io_failures failed = { .nr = 0 };
- trace_read_retry(&rbio->bio);
+ trace_and_count(c, read_retry, &rbio->bio);
if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);
@@ -1772,6 +2377,7 @@ static void __bch2_read_endio(struct work_struct *work)
struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags;
struct bch_csum csum;
+ int ret;
nofs_flags = memalloc_nofs_save();
@@ -1785,7 +2391,7 @@ static void __bch2_read_endio(struct work_struct *work)
}
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
goto csum_err;
/*
@@ -1806,7 +2412,10 @@ static void __bch2_read_endio(struct work_struct *work)
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
if (crc_is_compressed(crc)) {
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
goto decompression_err;
} else {
@@ -1817,7 +2426,9 @@ static void __bch2_read_endio(struct work_struct *work)
BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
src->bi_iter.bi_size = dst_iter.bi_size;
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
if (rbio->bounce) {
struct bvec_iter src_iter = src->bi_iter;
@@ -1830,7 +2441,10 @@ static void __bch2_read_endio(struct work_struct *work)
* Re encrypt data we decrypted, so it's consistent with
* rbio->crc:
*/
- bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
promote_start(rbio->promote, rbio);
rbio->promote = NULL;
}
@@ -1854,15 +2468,25 @@ csum_err:
goto out;
}
- bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
- "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
+ bch_err_inum_offset_ratelimited(ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo, crc.csum_type);
+ csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+ bch2_io_error(ca);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
decompression_err:
- bch_err_inum_ratelimited(c, rbio->read_pos.inode,
- "decompression error");
+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "decompression error");
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ goto out;
+decrypt_err:
+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "decrypt error");
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
goto out;
}
@@ -1893,10 +2517,9 @@ static void bch2_read_endio(struct bio *bio)
return;
}
- if (rbio->pick.ptr.cached &&
- (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr))) {
- atomic_long_inc(&c->read_realloc_races);
+ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ ptr_stale(ca, &rbio->pick.ptr)) {
+ trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
@@ -1906,6 +2529,7 @@ static void bch2_read_endio(struct bio *bio)
}
if (rbio->narrow_crcs ||
+ rbio->promote ||
crc_is_compressed(rbio->pick.crc) ||
bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
@@ -1937,7 +2561,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_reflink_v &&
k.k->type != KEY_TYPE_indirect_inline_data) {
- bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
+ bch_err_inum_offset_ratelimited(trans->c,
+ orig_k->k->k.p.inode,
+ orig_k->k->k.p.offset << 9,
"%llu len %u points to nonexistent indirect extent %llu",
orig_k->k->k.p.offset,
orig_k->k->k.size,
@@ -1954,6 +2580,41 @@ err:
return ret;
}
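+/*
+ * Reads from non-cached pointers whose bucket gen no longer matches are
+ * treated as filesystem inconsistencies: dump the extent plus the current
+ * in-memory gen and alloc key for the bucket, so the caller can mark the
+ * pointer as failed and retry the read from another replica.
+ */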
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+ struct btree_iter iter;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ PTR_BUCKET_POS(c, &ptr),
+ BTREE_ITER_CACHED);
+
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+ printbuf_indent_add(&buf, 2);
+ prt_newline(&buf);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+}
+
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
@@ -1963,7 +2624,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca;
+ struct bch_dev *ca = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
@@ -1980,7 +2641,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
zero_fill_bio_iter(&orig->bio, iter);
goto out_read_done;
}
-
+retry_pick:
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
/* hole or reservation - just zero fill: */
@@ -1988,13 +2649,33 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto hole;
if (pick_ret < 0) {
- bch_err_inum_ratelimited(c, k.k->p.inode,
- "no device to read from");
+ bch_err_inum_offset_ratelimited(c,
+ read_pos.inode, read_pos.offset << 9,
+ "no device to read from");
goto err;
}
- if (pick_ret > 0)
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ /*
+ * Stale dirty pointers are treated as IO errors, but @failed isn't
+ * allocated unless we're in the retry path - so if we're not in the
+ * retry path, don't check here, it'll be caught in bch2_read_endio()
+ * and we'll end up in the retry path:
+ */
+ if ((flags & BCH_READ_IN_RETRY) &&
+ !pick.ptr.cached &&
+ unlikely(ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ bch2_mark_io_failure(failed, &pick);
+ goto retry_pick;
+ }
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bch2_trans_unlock(trans);
if (flags & BCH_READ_NODECODE) {
/*
@@ -2031,7 +2712,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
}
if (orig->opts.promote_target)
- promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+ promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
&rbio, &bounce, &read_full);
if (!read_full) {
@@ -2067,8 +2748,10 @@ get_bio:
} else if (bounce) {
unsigned sectors = pick.crc.compressed_size;
- rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
+ rbio = rbio_init(bio_alloc_bioset(NULL,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ 0,
+ GFP_NOIO,
&c->bio_read_split),
orig->opts);
@@ -2084,8 +2767,8 @@ get_bio:
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
- &c->bio_read_split),
+ rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+ &c->bio_read_split),
orig->opts);
rbio->bio.bi_iter = iter;
rbio->split = true;
@@ -2127,8 +2810,9 @@ get_bio:
rbio->bio.bi_end_io = bch2_read_endio;
if (rbio->bounce)
- trace_read_bounce(&rbio->bio);
+ trace_and_count(c, read_bounce, &rbio->bio);
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
/*
@@ -2141,13 +2825,15 @@ get_bio:
if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
- trace_read_split(&orig->bio);
+ trace_and_count(c, read_split, &orig->bio);
}
if (!rbio->pick.idx) {
if (!rbio->have_ioref) {
- bch_err_inum_ratelimited(c, k.k->p.inode,
- "no device to read from");
+ bch_err_inum_offset_ratelimited(c,
+ read_pos.inode,
+ read_pos.offset << 9,
+ "no device to read from");
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -2156,10 +2842,21 @@ get_bio:
bio_sectors(&rbio->bio));
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
- if (likely(!(flags & BCH_READ_IN_RETRY)))
- submit_bio(&rbio->bio);
- else
- submit_bio_wait(&rbio->bio);
+ if (unlikely(c->opts.no_data_io)) {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ } else {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ submit_bio(&rbio->bio);
+ else
+ submit_bio_wait(&rbio->bio);
+ }
+
+ /*
+ * We just submitted IO which may block; we expect relock fail
+ * events and shouldn't count them:
+ */
+ trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(c, rbio)) {
@@ -2250,10 +2947,9 @@ retry:
* read_extent -> io_time_reset may cause a transaction restart
* without returning an error, we need to check for that here:
*/
- if (!bch2_trans_relock(&trans)) {
- ret = -EINTR;
+ ret = bch2_trans_relock(&trans);
+ if (ret)
break;
- }
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, bvec_iter.bi_sector));
@@ -2282,12 +2978,6 @@ retry:
*/
sectors = min(sectors, k.k->size - offset_into_extent);
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bch2_trans_unlock(&trans);
-
bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
@@ -2313,15 +3003,18 @@ retry:
err:
bch2_trans_iter_exit(&trans, &iter);
- if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ ret == READ_RETRY ||
+ ret == READ_RETRY_AVOID)
goto retry;
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
if (ret) {
- bch_err_inum_ratelimited(c, inum.inum,
- "read error %i from btree lookup", ret);
+ bch_err_inum_offset_ratelimited(c, inum.inum,
+ bvec_iter.bi_sector << 9,
+ "read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
}
@@ -2340,18 +3033,26 @@ void bch2_fs_io_exit(struct bch_fs *c)
int bch2_fs_io_init(struct bch_fs *c)
{
if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS) ||
- mempool_init_page_pool(&c->bio_bounce_pages,
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_init;
+
+ if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_write_init;
+
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
c->opts.encoded_extent_max) /
- PAGE_SIZE, 0) ||
- rhashtable_init(&c->promote_table, &bch_promote_params))
- return -ENOMEM;
+ PAGE_SIZE, 0))
+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+ if (rhashtable_init(&c->promote_table, &bch_promote_params))
+ return -BCH_ERR_ENOMEM_promote_table_init;
return 0;
}
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
index 1aa422dccef7..90948bb0aabd 100644
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
@@ -15,42 +15,51 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
- enum bch_data_type, const struct bkey_i *);
+ enum bch_data_type, const struct bkey_i *, bool);
#define BLK_STS_REMOVED ((__force blk_status_t)128)
const char *bch2_blk_status_to_str(blk_status_t);
-enum bch_write_flags {
- BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
- BCH_WRITE_CACHED = (1 << 1),
- BCH_WRITE_FLUSH = (1 << 2),
- BCH_WRITE_DATA_ENCODED = (1 << 3),
- BCH_WRITE_PAGES_STABLE = (1 << 4),
- BCH_WRITE_PAGES_OWNED = (1 << 5),
- BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
- BCH_WRITE_WROTE_DATA_INLINE = (1 << 7),
- BCH_WRITE_FROM_INTERNAL = (1 << 8),
- BCH_WRITE_CHECK_ENOSPC = (1 << 9),
-
- /* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10),
- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
- BCH_WRITE_DONE = (1 << 12),
+#define BCH_WRITE_FLAGS() \
+ x(ALLOC_NOWAIT) \
+ x(CACHED) \
+ x(DATA_ENCODED) \
+ x(PAGES_STABLE) \
+ x(PAGES_OWNED) \
+ x(ONLY_SPECIFIED_DEVS) \
+ x(WROTE_DATA_INLINE) \
+ x(FROM_INTERNAL) \
+ x(CHECK_ENOSPC) \
+ x(SYNC) \
+ x(MOVE) \
+ x(IN_WORKER) \
+ x(DONE) \
+ x(IO_ERROR) \
+ x(CONVERT_UNWRITTEN)
+
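+/*
+ * BCH_WRITE_FLAGS() is expanded twice below: once into __bch_write_flags
+ * (bit indices) and once into bch_write_flags (bit masks), so e.g.
+ * BCH_WRITE_SYNC == 1U << __BCH_WRITE_SYNC.  The same list also generates
+ * the bch2_write_flags[] string table used by prt_bitflags() in
+ * bch2_write_op_to_text().
+ */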
+enum __bch_write_flags {
+#define x(f) __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
};
-static inline u64 *op_journal_seq(struct bch_write_op *op)
-{
- return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
- ? op->journal_seq_p : &op->journal_seq;
-}
+enum bch_write_flags {
+#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
+};
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
- return op->alloc_reserve == RESERVE_MOVINGGC
+ return op->alloc_reserve == RESERVE_movinggc
? op->c->copygc_wq
: op->c->btree_update_wq;
}
@@ -59,14 +68,15 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
struct bkey_i *, bool *, s64 *, s64 *);
int bch2_extent_update(struct btree_trans *, subvol_inum,
struct btree_iter *, struct bkey_i *,
- struct disk_reservation *, u64 *, u64, s64 *, bool);
+ struct disk_reservation *, u64, s64 *, bool);
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+ unsigned, struct bch_io_opts, s64 *,
+ struct write_point_specifier);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-int bch2_write_index_default(struct bch_write_op *);
-
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct bch_io_opts opts)
{
@@ -75,11 +85,11 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->flags = 0;
op->written = 0;
op->error = 0;
- op->csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ op->csum_type = bch2_data_checksum_type(c, opts);
op->compression_type = bch2_compression_opt_to_type[opts.compression];
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
- op->alloc_reserve = RESERVE_NONE;
+ op->alloc_reserve = RESERVE_none;
op->incompressible = 0;
op->open_buckets.nr = 0;
op->devs_have.nr = 0;
@@ -90,14 +100,15 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
- op->journal_seq = 0;
op->new_i_size = U64_MAX;
op->i_sectors_delta = 0;
- op->index_update_fn = bch2_write_index_default;
+ op->devs_need_flush = NULL;
}
void bch2_write(struct closure *);
+void bch2_write_point_do_index_updates(struct work_struct *);
+
static inline struct bch_write_bio *wbio_init(struct bio *bio)
{
struct bch_write_bio *wbio = to_wbio(bio);
@@ -106,6 +117,8 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
return wbio;
}
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
struct bch_devs_mask;
struct cache_promote_op;
struct extent_ptr_decoded;
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
index 78bff13d36f2..3b2ed0fa583a 100644
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
@@ -87,6 +87,7 @@ struct bch_write_bio {
struct bch_write_bio *parent;
u64 submit_time;
+ u64 inode_offset;
struct bch_devs_list failed;
u8 dev;
@@ -95,6 +96,7 @@ struct bch_write_bio {
bounce:1,
put_bio:1,
have_ioref:1,
+ nocow:1,
used_mempool:1,
first_btree_write:1;
@@ -117,6 +119,7 @@ struct bch_write_op {
unsigned nr_replicas_required:4;
unsigned alloc_reserve:3;
unsigned incompressible:1;
+ unsigned stripe_waited:1;
struct bch_devs_list devs_have;
u16 target;
@@ -132,28 +135,27 @@ struct bch_write_op {
struct write_point_specifier write_point;
+ struct write_point *wp;
+ struct list_head wp_list;
+
struct disk_reservation res;
struct open_buckets open_buckets;
- /*
- * If caller wants to flush but hasn't passed us a journal_seq ptr, we
- * still need to stash the journal_seq somewhere:
- */
- union {
- u64 *journal_seq_p;
- u64 journal_seq;
- };
u64 new_i_size;
s64 i_sectors_delta;
- int (*index_update_fn)(struct bch_write_op *);
-
struct bch_devs_mask failed;
struct keylist insert_keys;
u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+ /*
+ * Bitmask of devices that have had nocow writes issued to them since
+ * last flush:
+ */
+ struct bch_devs_mask *devs_need_flush;
+
/* Must be last: */
struct bch_write_bio wbio;
};
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 158df42e5e10..3f0e6d71aa32 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -15,23 +15,26 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
+#include "journal_sb.h"
#include "journal_seq_blacklist.h"
-#include "super-io.h"
#include <trace/events/bcachefs.h>
-static u64 last_unwritten_seq(struct journal *j)
-{
- union journal_res_state s = READ_ONCE(j->reservations);
-
- lockdep_assert_held(&j->lock);
+#define x(n) #n,
+static const char * const bch2_journal_watermarks[] = {
+ JOURNAL_WATERMARKS()
+ NULL
+};
- return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
-}
+static const char * const bch2_journal_errors[] = {
+ JOURNAL_ERRORS()
+ NULL
+};
+#undef x
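+
+/*
+ * A journal sequence number is unwritten iff it is newer than j->seq_ondisk,
+ * the most recent entry known to have reached disk; the number of in-flight
+ * (unwritten) journal entries is then the distance between the in-memory
+ * sequence counter and seq_ondisk.
+ */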
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
- return seq >= last_unwritten_seq(j);
+ return seq > j->seq_ondisk;
}
static bool __journal_entry_is_open(union journal_res_state state)
@@ -39,6 +42,11 @@ static bool __journal_entry_is_open(union journal_res_state state)
return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
static bool journal_entry_is_open(struct journal *j)
{
return __journal_entry_is_open(j->reservations);
@@ -50,8 +58,6 @@ journal_seq_to_buf(struct journal *j, u64 seq)
struct journal_buf *buf = NULL;
EBUG_ON(seq > journal_cur_seq(j));
- EBUG_ON(seq == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
if (journal_seq_unwritten(j, seq)) {
buf = j->buf + (seq & JOURNAL_BUF_MASK);
@@ -62,59 +68,73 @@ journal_seq_to_buf(struct journal *j, u64 seq)
static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->key_cache_list);
+ unsigned i;
+ for (i = 0; i < ARRAY_SIZE(p->list); i++)
+ INIT_LIST_HEAD(&p->list[i]);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
-static void journal_pin_new_entry(struct journal *j)
-{
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- journal_pin_list_init(fifo_push_ref(&j->pin), 1);
-}
-
-static void bch2_journal_buf_init(struct journal *j)
+/*
+ * Detect stuck journal conditions and trigger shutdown. Technically the journal
+ * can end up stuck for a variety of reasons, such as a blocked I/O, journal
+ * reservation lockup, etc. Since this is a fatal error with potentially
+ * unpredictable characteristics, we want to be fairly conservative before we
+ * decide to shut things down.
+ *
+ * Consider the journal stuck when it appears full with no ability to commit
+ * btree transactions, discard journal buckets, or acquire a priority
+ * (reserved watermark) reservation.
+ */
+static inline bool
+journal_error_check_stuck(struct journal *j, int error, unsigned flags)
{
- struct journal_buf *buf = journal_cur_buf(j);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool stuck = false;
+ struct printbuf buf = PRINTBUF;
- bkey_extent_init(&buf->key);
- buf->noflush = false;
- buf->must_flush = false;
- buf->separate_flush = false;
+ if (!(error == JOURNAL_ERR_journal_full ||
+ error == JOURNAL_ERR_journal_pin_full) ||
+ nr_unwritten_journal_entries(j) ||
+ (flags & JOURNAL_WATERMARK_MASK) != JOURNAL_WATERMARK_reserved)
+ return stuck;
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
-}
+ spin_lock(&j->lock);
-void bch2_journal_halt(struct journal *j)
-{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
+ if (j->can_discard) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
-
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ stuck = true;
/*
- * XXX: we're not using j->lock here because this can be called from
- * interrupt context, this can race with journal_write_done()
+ * The journal shutdown path will set ->err_seq, but do it here first to
+ * serialize against concurrent failures and avoid duplicate error
+ * reports.
*/
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
- closure_wake_up(&journal_cur_buf(j)->wait);
+ if (j->err_seq) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
+ j->err_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
+ bch2_journal_errors[error]);
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "%s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_journal_pins_to_text(&buf, j);
+ bch_err(c, "Journal pins:\n%s", buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ dump_stack();
+
+ return stuck;
}
/* journal entry close/open: */
@@ -132,7 +152,7 @@ void __bch2_journal_buf_put(struct journal *j)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static bool __journal_entry_close(struct journal *j)
+static void __journal_entry_close(struct journal *j, unsigned closed_val)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@@ -140,34 +160,24 @@ static bool __journal_entry_close(struct journal *j)
u64 v = atomic64_read(&j->reservations.counter);
unsigned sectors;
+ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+ closed_val != JOURNAL_ENTRY_ERROR_VAL);
+
lockdep_assert_held(&j->lock);
do {
old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return true;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
- /* this entry will never be written: */
- closure_wake_up(&buf->wait);
- return true;
- }
-
- if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
- j->need_write_time = local_clock();
- }
-
- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
- new.idx++;
-
- if (new.idx == new.unwritten_idx)
- return false;
+ new.cur_entry_offset = closed_val;
- BUG_ON(journal_state_count(new, new.idx));
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+ old.cur_entry_offset == new.cur_entry_offset)
+ return;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ if (!__journal_entry_is_open(old))
+ return;
+
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
@@ -197,36 +207,43 @@ static bool __journal_entry_close(struct journal *j)
*/
buf->last_seq = journal_last_seq(j);
buf->data->last_seq = cpu_to_le64(buf->last_seq);
+ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
- /* Initialize new buffer: */
- journal_pin_new_entry(j);
-
- bch2_journal_buf_init(j);
-
cancel_delayed_work(&j->write_work);
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
bch2_journal_space_available(j);
bch2_journal_buf_put(j, old.idx);
- return true;
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ journal_wake(j);
+ spin_unlock(&j->lock);
}
static bool journal_entry_want_write(struct journal *j)
{
- union journal_res_state s = READ_ONCE(j->reservations);
- bool ret = false;
+ bool ret = !journal_entry_is_open(j) ||
+ journal_cur_seq(j) == journal_last_unwritten_seq(j);
- /*
- * Don't close it yet if we already have a write in flight, but do set
- * NEED_WRITE:
- */
- if (s.idx != s.unwritten_idx)
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
- else
- ret = __journal_entry_close(j);
+ /* Don't close it yet if we already have a write in flight: */
+ if (ret)
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ else if (nr_unwritten_journal_entries(j)) {
+ struct journal_buf *buf = journal_cur_buf(j);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+ }
return ret;
}
@@ -245,44 +262,81 @@ static bool journal_entry_close(struct journal *j)
/*
 * should _only_ be called from journal_res_get() - when we actually want a
* journal reservation - journal entry is open means journal is dirty:
- *
- * returns:
- * 0: success
- * -ENOSPC: journal currently full, must invoke reclaim
- * -EAGAIN: journal blocked, must wait
- * -EROFS: insufficient rw devices or journal error
*/
static int journal_entry_open(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf = journal_cur_buf(j);
+ struct journal_buf *buf = j->buf +
+ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
union journal_res_state old, new;
int u64s;
u64 v;
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
if (j->blocked)
- return cur_entry_blocked;
+ return JOURNAL_ERR_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
+ if (bch2_journal_error(j))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
+ if (!fifo_free(&j->pin))
+ return JOURNAL_ERR_journal_pin_full;
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
+ return JOURNAL_ERR_max_in_flight;
+
BUG_ON(!j->cur_entry_sectors);
+ buf->expires =
+ (journal_cur_seq(j) == j->flushed_seq_ondisk
+ ? jiffies
+ : j->last_flush_write) +
+ msecs_to_jiffies(c->opts.journal_flush_delay);
+
buf->u64s_reserved = j->entry_u64s_reserved;
buf->disk_sectors = j->cur_entry_sectors;
buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
u64s = (int) (buf->sectors << 9) / sizeof(u64) -
journal_entry_overhead(j);
- u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+
+ if (u64s <= (ssize_t) j->early_journal_entries.nr)
+ return JOURNAL_ERR_journal_full;
- if (u64s <= le32_to_cpu(buf->data->u64s))
- return cur_entry_journal_full;
+ if (fifo_empty(&j->pin) && j->reclaim_thread)
+ wake_up_process(j->reclaim_thread);
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for journal_last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+
+ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(journal_cur_seq(j));
+ buf->data->u64s = 0;
+
+ if (j->early_journal_entries.nr) {
+ memcpy(buf->data->_data, j->early_journal_entries.data,
+ j->early_journal_entries.nr * sizeof(u64));
+ le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
+ }
/*
* Must be set before marking the journal entry as open:
@@ -293,14 +347,16 @@ static int journal_entry_open(struct journal *j)
do {
old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return cur_entry_insufficient_devices;
+ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ new.idx++;
+ BUG_ON(journal_state_count(new, new.idx));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
- EBUG_ON(journal_state_count(new, new.idx));
journal_state_inc(&new);
+
+ /* Handle any already added entries */
+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
@@ -313,13 +369,15 @@ static int journal_entry_open(struct journal *j)
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
+
+ if (j->early_journal_entries.nr)
+ darray_exit(&j->early_journal_entries);
return 0;
}
static bool journal_quiesced(struct journal *j)
{
- union journal_res_state s = READ_ONCE(j->reservations);
- bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
+ bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
if (!ret)
journal_entry_close(j);
@@ -334,8 +392,21 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ long delta;
+
+ spin_lock(&j->lock);
+ if (!__journal_entry_is_open(j->reservations))
+ goto unlock;
+
+ delta = journal_cur_buf(j)->expires - jiffies;
- journal_entry_close(j);
+ if (delta > 0)
+ mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+unlock:
+ spin_unlock(&j->lock);
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
@@ -350,10 +421,16 @@ retry:
return 0;
if (bch2_journal_error(j))
- return -EROFS;
+ return -BCH_ERR_erofs_journal_err;
spin_lock(&j->lock);
+ /* check once more in case somebody else shut things down... */
+ if (bch2_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return -BCH_ERR_erofs_journal_err;
+ }
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call journal_entry_close()
@@ -364,13 +441,12 @@ retry:
return 0;
}
- if (!(flags & JOURNAL_RES_GET_RESERVED) &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
+ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) {
/*
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
- ret = cur_entry_journal_full;
+ ret = JOURNAL_ERR_journal_full;
goto unlock;
}
@@ -385,23 +461,16 @@ retry:
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- if (journal_entry_is_open(j) &&
- !__journal_entry_close(j)) {
- /*
- * We failed to get a reservation on the current open journal
- * entry because it's full, and we can't close it because
- * there's still a previous one in flight:
- */
- trace_journal_entry_full(c);
- ret = cur_entry_blocked;
- } else {
- ret = journal_entry_open(j);
- }
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ ret = journal_entry_open(j);
+
+ if (ret == JOURNAL_ERR_max_in_flight)
+ trace_and_count(c, journal_entry_full, c);
unlock:
- if ((ret && ret != cur_entry_insufficient_devices) &&
+ if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
- trace_journal_full(c);
+ trace_and_count(c, journal_full, c);
}
can_discard = j->can_discard;
@@ -409,34 +478,15 @@ unlock:
if (!ret)
goto retry;
-
- if ((ret == cur_entry_journal_full ||
- ret == cur_entry_journal_pin_full) &&
- !can_discard &&
- j->reservations.idx == j->reservations.unwritten_idx &&
- (flags & JOURNAL_RES_GET_RESERVED)) {
- char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
-
- bch_err(c, "Journal stuck!");
- if (journal_debug_buf) {
- bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
- bch_err(c, "%s", journal_debug_buf);
-
- bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
- bch_err(c, "Journal pins:\n%s", journal_debug_buf);
- kfree(journal_debug_buf);
- }
-
- bch2_fatal_error(c);
- dump_stack();
- }
+ if (journal_error_check_stuck(j, ret, flags))
+ ret = -BCH_ERR_journal_res_get_blocked;
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
- if ((ret == cur_entry_journal_full ||
- ret == cur_entry_journal_pin_full) &&
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
@@ -449,7 +499,9 @@ unlock:
}
}
- return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
+ return ret == JOURNAL_ERR_insufficient_devices
+ ? -EROFS
+ : -BCH_ERR_journal_res_get_blocked;
}
/*
@@ -468,7 +520,8 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
int ret;
closure_wait_event(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
+ (ret = __journal_res_get(j, res, flags)) !=
+ -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
@@ -528,7 +581,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@@ -573,12 +626,15 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
}
/* if seq was written, but not flushed - flush a newer one instead */
- seq = max(seq, last_unwritten_seq(j));
+ seq = max(seq, journal_last_unwritten_seq(j));
recheck_need_open:
- if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+ if (seq > journal_cur_seq(j)) {
struct journal_res res = { 0 };
+ if (journal_entry_is_open(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
spin_unlock(&j->lock);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
@@ -588,7 +644,11 @@ recheck_need_open:
seq = res.seq;
buf = j->buf + (seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
if (parent && !closure_wait(&buf->wait, parent))
BUG();
@@ -640,69 +700,18 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
return ret ?: ret2 < 0 ? ret2 : 0;
}
-int bch2_journal_meta(struct journal *j)
-{
- struct journal_buf *buf;
- struct journal_res res;
- int ret;
-
- memset(&res, 0, sizeof(res));
-
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
- if (ret)
- return ret;
-
- buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
- buf->must_flush = true;
- set_bit(JOURNAL_NEED_WRITE, &j->flags);
-
- bch2_journal_res_put(j, &res);
-
- return bch2_journal_flush_seq(j, res.seq);
-}
-
/*
* bch2_journal_flush_async - if there is an open journal entry, or a journal
* still being written, write it and wait for the write to complete
*/
void bch2_journal_flush_async(struct journal *j, struct closure *parent)
{
- u64 seq, journal_seq;
-
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
-
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return;
- }
- spin_unlock(&j->lock);
-
- bch2_journal_flush_seq_async(j, seq, parent);
+ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
}
int bch2_journal_flush(struct journal *j)
{
- u64 seq, journal_seq;
-
- spin_lock(&j->lock);
- journal_seq = journal_cur_seq(j);
-
- if (journal_entry_is_open(j)) {
- seq = journal_seq;
- } else if (journal_seq) {
- seq = journal_seq - 1;
- } else {
- spin_unlock(&j->lock);
- return 0;
- }
- spin_unlock(&j->lock);
-
- return bch2_journal_flush_seq(j, seq);
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
}
/*
@@ -725,13 +734,13 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
if (seq <= c->journal.flushed_seq_ondisk)
goto out;
- for (unwritten_seq = last_unwritten_seq(j);
+ for (unwritten_seq = journal_last_unwritten_seq(j);
unwritten_seq < seq;
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
/* journal write is already in flight, and was a flush write: */
- if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush)
+ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
goto out;
buf->noflush = true;
@@ -743,6 +752,31 @@ out:
return ret;
}
+int bch2_journal_meta(struct journal *j)
+{
+ struct journal_buf *buf;
+ struct journal_res res;
+ int ret;
+
+ memset(&res, 0, sizeof(res));
+
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+
+ bch2_journal_res_put(j, &res);
+
+ return bch2_journal_flush_seq(j, res.seq);
+}
+
/* block/unlock the journal: */
void bch2_journal_unblock(struct journal *j)
@@ -770,116 +804,129 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ struct open_bucket **ob = NULL;
+ long *bu = NULL;
+ unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
int ret = 0;
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
+ BUG_ON(nr <= ja->nr);
- new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq) {
- ret = -ENOMEM;
- goto err;
+ bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
+ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
+ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ if (!bu || !ob || !new_buckets || !new_bucket_seq) {
+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ goto err_free;
}
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets) {
- ret = -ENOSPC;
- goto err;
+ for (nr_got = 0; nr_got < nr_want; nr_got++) {
+ if (new_fs) {
+ bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+ if (bu[nr_got] < 0) {
+ ret = -BCH_ERR_ENOSPC_bucket_alloc;
+ break;
+ }
+ } else {
+ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, cl);
+ ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+ if (ret)
+ break;
+
+ ret = bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(&trans, ca,
+ ob[nr_got]->bucket, BCH_DATA_journal,
+ ca->mi.bucket_size));
+ if (ret) {
+ bch2_open_bucket_put(c, ob[nr_got]);
+ bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+ break;
+ }
+
+ bu[nr_got] = ob[nr_got]->bucket;
+ }
}
- /*
- * We may be called from the device add path, before the new device has
- * actually been added to the running filesystem:
- */
- if (!new_fs)
- spin_lock(&c->journal.lock);
+ if (!nr_got)
+ goto err_free;
+
+ /* Don't return an error if we successfully allocated some buckets: */
+ ret = 0;
+
+ if (c) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_block(&c->journal);
+ mutex_lock(&c->sb_lock);
+ }
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- if (!new_fs)
- spin_unlock(&c->journal.lock);
+ BUG_ON(ja->discard_idx > ja->nr);
- while (ja->nr < nr) {
- struct open_bucket *ob = NULL;
- unsigned pos;
- long b;
+ pos = ja->discard_idx ?: ja->nr;
- if (new_fs) {
- b = bch2_bucket_alloc_new_fs(ca);
- if (b < 0) {
- ret = -ENOSPC;
- goto err;
- }
- } else {
- rcu_read_lock();
- ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
- false, cl);
- rcu_read_unlock();
- if (IS_ERR(ob)) {
- ret = cl ? -EAGAIN : -ENOSPC;
- goto err;
- }
+ memmove(new_buckets + pos + nr_got,
+ new_buckets + pos,
+ sizeof(new_buckets[0]) * (ja->nr - pos));
+ memmove(new_bucket_seq + pos + nr_got,
+ new_bucket_seq + pos,
+ sizeof(new_bucket_seq[0]) * (ja->nr - pos));
- b = ob->bucket;
- }
+ for (i = 0; i < nr_got; i++) {
+ new_buckets[pos + i] = bu[i];
+ new_bucket_seq[pos + i] = 0;
+ }
- if (c)
- spin_lock(&c->journal.lock);
+ nr = ja->nr + nr_got;
- /*
- * XXX
- * For resize at runtime, we should be writing the new
- * superblock before inserting into the journal array
- */
+ ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+ if (ret)
+ goto err_unblock;
- pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
- __array_insert_item(ja->buckets, ja->nr, pos);
- __array_insert_item(ja->bucket_seq, ja->nr, pos);
- __array_insert_item(journal_buckets->buckets, ja->nr, pos);
- ja->nr++;
-
- ja->buckets[pos] = b;
- ja->bucket_seq[pos] = 0;
- journal_buckets->buckets[pos] = cpu_to_le64(b);
-
- if (pos <= ja->discard_idx)
- ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
- if (pos <= ja->dirty_idx_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
- if (pos <= ja->dirty_idx)
- ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
- if (pos <= ja->cur_idx)
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-
- if (c)
- spin_unlock(&c->journal.lock);
-
- if (!new_fs) {
- ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_trans_mark_metadata_bucket(&trans, ca,
- b, BCH_DATA_journal,
- ca->mi.bucket_size));
+ if (!new_fs)
+ bch2_write_super(c);
- bch2_open_bucket_put(c, ob);
+ /* Commit: */
+ if (c)
+ spin_lock(&c->journal.lock);
- if (ret)
- goto err;
- }
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+ ja->nr = nr;
+
+ if (pos <= ja->discard_idx)
+ ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx)
+ ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+ if (pos <= ja->cur_idx)
+ ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
+
+ if (c)
+ spin_unlock(&c->journal.lock);
+err_unblock:
+ if (c) {
+ bch2_journal_unblock(&c->journal);
+ mutex_unlock(&c->sb_lock);
}
-err:
- bch2_sb_resize_journal(&ca->disk_sb,
- ja->nr + sizeof(*journal_buckets) / sizeof(u64));
+
+ if (ret && !new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(&trans, ca,
+ bu[i], BCH_DATA_free, 0));
+err_free:
+ if (!new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
+
kfree(new_bucket_seq);
kfree(new_buckets);
-
+ kfree(ob);
+ kfree(bu);
return ret;
}
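
The rework above batches bucket allocation and then splices the new buckets into the ring at the discard index in a single step; any ring index at or past that position (discard_idx, dirty_idx_ondisk, dirty_idx, cur_idx) is advanced by the same count modulo the new size. A minimal standalone sketch of that splice-and-shift pattern — the helper name and the demo main() are illustrative, not part of the patch:

#include <stdio.h>
#include <string.h>

/* Insert `got` new bucket numbers at position `pos` into buckets[]
 * (currently nr entries, capacity >= nr + got), following the same
 * memmove-then-fill pattern __bch2_set_nr_journal_buckets() now uses. */
static unsigned splice_buckets(unsigned long *buckets, unsigned nr,
			       unsigned pos,
			       const unsigned long *bu, unsigned got)
{
	memmove(buckets + pos + got, buckets + pos,
		sizeof(buckets[0]) * (nr - pos));
	for (unsigned i = 0; i < got; i++)
		buckets[pos + i] = bu[i];
	return nr + got;
}

int main(void)
{
	unsigned long buckets[8] = { 10, 11, 12, 13 };
	unsigned long new_bu[2] = { 20, 21 };
	unsigned nr = splice_buckets(buckets, 4, 2, new_bu, 2);

	for (unsigned i = 0; i < nr; i++)
		printf("%lu ", buckets[i]);	/* prints: 10 11 20 21 12 13 */
	printf("\n");
	return 0;
}
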
@@ -892,41 +939,49 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
{
struct journal_device *ja = &ca->journal;
struct closure cl;
- unsigned current_nr;
- int ret;
+ int ret = 0;
closure_init_stack(&cl);
- do {
- struct disk_reservation disk_res = { 0, 0 };
+ down_write(&c->state_lock);
- closure_sync(&cl);
+ /* don't handle reducing nr of buckets yet: */
+ if (nr < ja->nr)
+ goto unlock;
- mutex_lock(&c->sb_lock);
- current_nr = ja->nr;
+ while (ja->nr < nr) {
+ struct disk_reservation disk_res = { 0, 0 };
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
* we don't need the disk reservation to avoid the BUG_ON() in buckets.c
* when space used goes up without a reservation - but we do need the
* reservation to ensure we'll actually be able to allocate:
+ *
+ * XXX: that's not right, disk reservations only ensure a
+ * filesystem-wide allocation will succeed, this is a device
+ * specific allocation - we can hang here:
*/
- if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
- mutex_unlock(&c->sb_lock);
- return -ENOSPC;
- }
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret)
+ break;
ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
bch2_disk_reservation_put(c, &disk_res);
- if (ja->nr != current_nr)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- } while (ret == -EAGAIN);
+ closure_sync(&cl);
+
+ if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
+ break;
+ }
+ if (ret)
+ bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
+unlock:
+ up_write(&c->state_lock);
return ret;
}
@@ -935,7 +990,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
unsigned nr;
if (dynamic_fault("bcachefs:add:journal_alloc"))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_set_nr_journal_buckets;
/* 1/128th of the device by default: */
nr = ca->mi.nbuckets >> 7;
@@ -956,17 +1011,16 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
- union journal_res_state state;
bool ret = false;
- unsigned i;
+ u64 seq;
spin_lock(&j->lock);
- state = READ_ONCE(j->reservations);
- i = state.idx;
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j) && !ret;
+ seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, seq);
- while (i != state.unwritten_idx) {
- i = (i - 1) & JOURNAL_BUF_MASK;
- if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+ if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
ret = true;
}
spin_unlock(&j->lock);
@@ -981,6 +1035,7 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
+ bch2_journal_reclaim_stop(j);
bch2_journal_flush_all_pins(j);
wait_event(j->wait, journal_entry_close(j));
@@ -995,24 +1050,30 @@ void bch2_fs_journal_stop(struct journal *j)
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
- (journal_entry_is_open(j) ||
- j->last_empty_seq + 1 != journal_cur_seq(j)));
+ j->last_empty_seq != journal_cur_seq(j));
cancel_delayed_work_sync(&j->write_work);
- bch2_journal_reclaim_stop(j);
}
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
- struct list_head *journal_entries)
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
- struct journal_replay *i;
+ struct journal_replay *i, **_i;
+ struct genradix_iter iter;
+ bool had_entries = false;
+ unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
- if (!list_empty(journal_entries))
- last_seq = le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
+ genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
+ }
nr = cur_seq - last_seq;
@@ -1021,7 +1082,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
}
}
@@ -1029,18 +1090,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
- if (list_empty(journal_entries))
- j->last_empty_seq = cur_seq - 1;
-
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
- list_for_each_entry(i, journal_entries, list) {
- unsigned ptr;
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq >= cur_seq);
@@ -1056,9 +1118,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
p->devs.nr = 0;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+ had_entries = true;
}
- if (list_empty(journal_entries))
+ if (!had_entries)
j->last_empty_seq = cur_seq;
spin_lock(&j->lock);
@@ -1066,11 +1130,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
set_bit(JOURNAL_STARTED, &j->flags);
j->last_flush_write = jiffies;
- journal_pin_new_entry(j);
-
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
-
- bch2_journal_buf_init(j);
+ j->reservations.unwritten_idx++;
c->last_bucket_seq_cleanup = journal_cur_seq(j);
@@ -1098,25 +1159,49 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_get_journal(sb);
- unsigned i;
+ struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+ bch2_sb_get_journal_v2(sb);
+ unsigned i, nr_bvecs;
+
+ ja->nr = 0;
- ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+ for (i = 0; i < nr; i++)
+ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+ } else if (journal_buckets) {
+ ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ }
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->bucket_seq)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_dev_journal_init;
- ca->journal.bio = bio_kmalloc(GFP_KERNEL,
- DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
+ nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+
+ ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!ca->journal.bio)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
- return -ENOMEM;
-
- for (i = 0; i < ja->nr; i++)
- ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+ unsigned j, dst = 0;
+
+ for (i = 0; i < nr; i++)
+ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ ja->buckets[dst++] =
+ le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+ } else if (journal_buckets) {
+ for (i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ }
return 0;
}
@@ -1125,6 +1210,8 @@ void bch2_fs_journal_exit(struct journal *j)
{
unsigned i;
+ darray_exit(&j->early_journal_entries);
+
for (i = 0; i < ARRAY_SIZE(j->buf); i++)
kvpfree(j->buf[i].data, j->buf[i].buf_size);
free_fifo(&j->pin);
@@ -1155,7 +1242,7 @@ int bch2_fs_journal_init(struct journal *j)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_journal_pin_fifo;
goto out;
}
@@ -1163,7 +1250,7 @@ int bch2_fs_journal_init(struct journal *j)
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data) {
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_journal_buf;
goto out;
}
}
@@ -1182,68 +1269,94 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
union journal_res_state s;
struct bch_dev *ca;
unsigned long now = jiffies;
+ u64 seq;
unsigned i;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+ out->atomic++;
+
rcu_read_lock();
s = READ_ONCE(j->reservations);
- pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin));
- pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
- pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
- pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
- pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
- pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
- pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
- pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
- pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
+ prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
+ prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error);
- pr_buf(out, "current entry:\t\t");
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
- pr_buf(out, "error\n");
+ prt_printf(out, "error");
break;
case JOURNAL_ENTRY_CLOSED_VAL:
- pr_buf(out, "closed\n");
+ prt_printf(out, "closed");
break;
default:
- pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
+ prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
break;
}
- pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx));
+ prt_newline(out);
+
+ for (seq = journal_cur_seq(j);
+ seq >= journal_last_unwritten_seq(j);
+ --seq) {
+ i = seq & JOURNAL_BUF_MASK;
+
+ prt_printf(out, "unwritten entry:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "sectors:");
+ prt_tab(out);
+ prt_printf(out, "%u", j->buf[i].sectors);
+ prt_newline(out);
- i = s.idx;
- while (i != s.unwritten_idx) {
- i = (i - 1) & JOURNAL_BUF_MASK;
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
+ prt_newline(out);
- pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
- i, journal_state_count(s, i), j->buf[i].sectors);
+ printbuf_indent_sub(out, 2);
}
- pr_buf(out,
- "need write:\t\t%i\n"
+ prt_printf(out,
"replay done:\t\t%i\n",
- test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
- pr_buf(out, "space:\n");
- pr_buf(out, "\tdiscarded\t%u:%u\n",
+ prt_printf(out, "space:\n");
+ prt_printf(out, "\tdiscarded\t%u:%u\n",
j->space[journal_space_discarded].next_entry,
j->space[journal_space_discarded].total);
- pr_buf(out, "\tclean ondisk\t%u:%u\n",
+ prt_printf(out, "\tclean ondisk\t%u:%u\n",
j->space[journal_space_clean_ondisk].next_entry,
j->space[journal_space_clean_ondisk].total);
- pr_buf(out, "\tclean\t\t%u:%u\n",
+ prt_printf(out, "\tclean\t\t%u:%u\n",
j->space[journal_space_clean].next_entry,
j->space[journal_space_clean].total);
- pr_buf(out, "\ttotal\t\t%u:%u\n",
+ prt_printf(out, "\ttotal\t\t%u:%u\n",
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
@@ -1257,17 +1370,19 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
if (!ja->nr)
continue;
- pr_buf(out, "dev %u:\n", i);
- pr_buf(out, "\tnr\t\t%u\n", ja->nr);
- pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
- pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
- pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
- pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
- pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
- pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ prt_printf(out, "dev %u:\n", i);
+ prt_printf(out, "\tnr\t\t%u\n", ja->nr);
+ prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
}
rcu_read_unlock();
+
+ --out->atomic;
}
void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
@@ -1277,27 +1392,56 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
spin_unlock(&j->lock);
}
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
- u64 i;
+ unsigned i;
spin_lock(&j->lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
- pr_buf(out, "%llu: count %u\n",
- i, atomic_read(&pin_list->count));
+ *seq = max(*seq, j->pin.front);
+
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ out->atomic++;
+
+ pin_list = journal_seq_pin(j, *seq);
- list_for_each_entry(pin, &pin_list->list, list)
- pr_buf(out, "\t%px %ps\n",
- pin, pin->flush);
+ prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ list_for_each_entry(pin, &pin_list->list[i], list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
- if (!list_empty(&pin_list->flushed))
- pr_buf(out, "flushed:\n");
+ if (!list_empty(&pin_list->flushed)) {
+ prt_printf(out, "flushed:");
+ prt_newline(out);
+ }
- list_for_each_entry(pin, &pin_list->flushed, list)
- pr_buf(out, "\t%px %ps\n",
- pin, pin->flush);
+ list_for_each_entry(pin, &pin_list->flushed, list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
}
+
+ printbuf_indent_sub(out, 2);
+
+ --out->atomic;
spin_unlock(&j->lock);
+
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
}
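
Several of the hunks above switch from walking ring indices to walking sequence numbers directly: the buffer for a given sequence is seq & JOURNAL_BUF_MASK, and the first unwritten sequence is seq_ondisk + 1. A small self-contained sketch of that mapping — the constants are assumed to mirror the bcachefs journal definitions, and the demo values are made up:

#include <stdio.h>
#include <stdint.h>

/* Assumed to match the bcachefs journal ring: 1 << 2 = 4 in-memory buffers. */
#define JOURNAL_BUF_BITS	2
#define JOURNAL_BUF_NR		(1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)

static unsigned seq_to_buf_idx(uint64_t seq)
{
	return seq & JOURNAL_BUF_MASK;
}

int main(void)
{
	uint64_t seq_ondisk = 41, cur_seq = 43;	/* made-up example values */

	/* the new journal_last_unwritten_seq() is seq_ondisk + 1 */
	for (uint64_t seq = seq_ondisk + 1; seq <= cur_seq; seq++)
		printf("seq %llu -> buf %u\n",
		       (unsigned long long) seq, seq_to_buf_idx(seq));
	return 0;
}
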
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 62f9aec4a427..024cea9f5902 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -29,8 +29,8 @@
*
* Synchronous updates are specified by passing a closure (@flush_cl) to
* bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will will wait on the journal
- * write to complete (via closure_wait()).
+ * down to the journalling code. That closure will wait on the journal write to
+ * complete (via closure_wait()).
*
* If the index update wasn't synchronous, the journal entry will be
* written out after 10 ms have elapsed, by default (the delay_ms field
@@ -141,7 +141,10 @@ static inline u64 journal_cur_seq(struct journal *j)
return j->pin.back - 1;
}
-void bch2_journal_set_has_inum(struct journal *, u64, u64);
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+ return j->seq_ondisk + 1;
+}
static inline int journal_state_count(union journal_res_state s, int idx)
{
@@ -196,9 +199,9 @@ journal_res_entry(struct journal *j, struct journal_res *res)
return vstruct_idx(j->buf[res->idx].data, res->offset);
}
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
enum btree_id id, unsigned level,
- const void *data, unsigned u64s)
+ unsigned u64s)
{
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
@@ -207,32 +210,33 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type
entry->pad[0] = 0;
entry->pad[1] = 0;
entry->pad[2] = 0;
- memcpy_u64s_small(entry->_data, data, u64s);
-
return jset_u64s(u64s);
}
-static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
- unsigned type, enum btree_id id,
- unsigned level,
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
const void *data, unsigned u64s)
{
- unsigned actual = journal_entry_set(journal_res_entry(j, res),
- type, id, level, data, u64s);
+ unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+ memcpy_u64s_small(entry->_data, data, u64s);
+ return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned u64s)
+{
+ struct jset_entry *entry = journal_res_entry(j, res);
+ unsigned actual = journal_entry_init(entry, type, id, level, u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
-}
-
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
- enum btree_id id, unsigned level,
- const struct bkey_i *k)
-{
- bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
- id, level, k, k->k.u64s);
+ return entry;
}
static inline bool journal_entry_empty(struct jset *j)
@@ -261,9 +265,6 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
.buf3_count = idx == 3,
}).v, &j->reservations.counter);
- EBUG_ON(((s.idx - idx) & 3) >
- ((s.idx - s.unwritten_idx) & 3));
-
if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
__bch2_journal_buf_put(j);
}
@@ -278,12 +279,12 @@ static inline void bch2_journal_res_put(struct journal *j,
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
BCH_JSET_ENTRY_btree_keys,
- 0, 0, NULL, 0);
+ 0, 0, 0);
bch2_journal_buf_put(j, res->idx);
@@ -293,9 +294,9 @@ static inline void bch2_journal_res_put(struct journal *j,
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned);
-#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
-#define JOURNAL_RES_GET_CHECK (1 << 1)
-#define JOURNAL_RES_GET_RESERVED (1 << 2)
+/* First two bits for JOURNAL_WATERMARK: */
+#define JOURNAL_RES_GET_NONBLOCK (1 << 2)
+#define JOURNAL_RES_GET_CHECK (1 << 3)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
@@ -316,8 +317,7 @@ static inline int journal_res_get_fast(struct journal *j,
EBUG_ON(!journal_state_count(new, new.idx));
- if (!(flags & JOURNAL_RES_GET_RESERVED) &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
+ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark)
return 0;
new.cur_entry_offset += res->u64s;
@@ -370,23 +370,27 @@ out:
/* journal_preres: */
-static inline bool journal_check_may_get_unreserved(struct journal *j)
+static inline void journal_set_watermark(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
- bool ret = s.reserved < s.remaining &&
- fifo_free(&j->pin) > 8;
-
- lockdep_assert_held(&j->lock);
-
- if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
- if (ret) {
- set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
- journal_wake(j);
- } else {
- clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
- }
- }
- return ret;
+ unsigned watermark = JOURNAL_WATERMARK_any;
+
+ if (fifo_free(&j->pin) < j->pin.size / 4)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+ if (fifo_free(&j->pin) < j->pin.size / 8)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+ if (s.reserved > s.remaining)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc);
+ if (!s.remaining)
+ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved);
+
+ if (watermark == j->watermark)
+ return;
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
}
static inline void bch2_journal_preres_put(struct journal *j,
@@ -406,12 +410,8 @@ static inline void bch2_journal_preres_put(struct journal *j,
closure_wake_up(&j->preres_wait);
}
- if (s.reserved <= s.remaining &&
- !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
- spin_lock(&j->lock);
- journal_check_may_get_unreserved(j);
- spin_unlock(&j->lock);
- }
+ if (s.reserved <= s.remaining && j->watermark)
+ journal_set_watermark(j);
}
int __bch2_journal_preres_get(struct journal *,
@@ -432,8 +432,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
old.v = new.v = v;
ret = 0;
- if ((flags & JOURNAL_RES_GET_RESERVED) ||
- test_bit(JOURNAL_NOCHANGES, &j->flags) ||
+ if ((flags & JOURNAL_WATERMARK_reserved) ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
@@ -461,7 +460,7 @@ static inline int bch2_journal_preres_get(struct journal *j,
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
- return -EAGAIN;
+ return -BCH_ERR_journal_preres_get_blocked;
return __bch2_journal_preres_get(j, res, new_u64s, flags);
}
@@ -502,6 +501,7 @@ void bch2_journal_block(struct journal *);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
@@ -510,7 +510,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
+int bch2_fs_journal_start(struct journal *, u64);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
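
With JOURNAL_MAY_GET_UNRESERVED gone, reservations now carry a watermark in the low bits of the flags word and only proceed when that watermark is at least as privileged as the journal's current one. A minimal sketch of the ordering check — the enum follows the any < copygc < reserved ordering used above, and everything else here is illustrative rather than taken from the patch:

#include <stdio.h>

enum journal_watermark {	/* assumed ordering, least to most privileged */
	WATERMARK_any,
	WATERMARK_copygc,
	WATERMARK_reserved,
};

/* Mirrors the fast-path check:
 *	if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark)
 *		return 0;
 */
static int res_allowed(enum journal_watermark res, enum journal_watermark j)
{
	return res >= j;
}

int main(void)
{
	printf("%d\n", res_allowed(WATERMARK_any, WATERMARK_copygc));	   /* 0: blocked */
	printf("%d\n", res_allowed(WATERMARK_reserved, WATERMARK_copygc)); /* 1: allowed */
	return 0;
}
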
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index e566f8516052..45b1b839783d 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
@@ -16,12 +17,39 @@
#include <trace/events/bcachefs.h>
-static void __journal_replay_free(struct journal_replay *i)
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j)
{
- list_del(&i->list);
+ return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
+ !bch2_crc_cmp(j->csum,
+ csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+}
+
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
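
journal_entry_radix_idx() above is what lets a 64-bit sequence number index a genradix keyed by unsigned long: entries are stored relative to journal_entries_base_seq and masked to 31 bits, so anything within roughly two billion entries of the first one seen stays in range. A standalone sketch of the same arithmetic; the helper and demo values are hypothetical:

#include <stdio.h>
#include <stdint.h>

/* Userspace copy of the index calculation: offset from a base sequence,
 * masked to 31 bits so it fits an unsigned long even on 32-bit. */
static uint32_t radix_idx(uint64_t base_seq, uint64_t seq)
{
	return (seq - base_seq) & (~0U >> 1);
}

int main(void)
{
	/* the patch picks base as max(1, first_seq - S32_MAX) */
	uint64_t base = 1000000;

	printf("%u\n", radix_idx(base, 1000000));	/* 0 */
	printf("%u\n", radix_idx(base, 1000042));	/* 42 */
	return 0;
}
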
+static void __journal_replay_free(struct bch_fs *c,
+ struct journal_replay *i)
+{
+ struct journal_replay **p =
+ genradix_ptr(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+ BUG_ON(*p != i);
+ *p = NULL;
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
-
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -29,13 +57,13 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
i->ignore = true;
if (!c->opts.read_entire_journal)
- __journal_replay_free(i);
+ __journal_replay_free(c, i);
}
struct journal_list {
struct closure cl;
+ u64 last_seq;
struct mutex lock;
- struct list_head *head;
int ret;
};
@@ -47,94 +75,105 @@ struct journal_list {
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
- struct bch_extent_ptr entry_ptr,
- struct journal_list *jlist, struct jset *j,
- bool bad)
+ struct journal_ptr entry_ptr,
+ struct journal_list *jlist, struct jset *j)
{
- struct journal_replay *i, *pos, *dup = NULL;
- struct bch_extent_ptr *ptr;
- struct list_head *where;
+ struct genradix_iter iter;
+ struct journal_replay **_i, *i, *dup;
+ struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
- u64 last_seq = 0;
+ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
int ret = JOURNAL_ENTRY_ADD_OK;
- list_for_each_entry_reverse(i, jlist->head, list) {
- if (!JSET_NO_FLUSH(&i->j)) {
- last_seq = le64_to_cpu(i->j.last_seq);
- break;
- }
- }
-
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
- le64_to_cpu(j->seq) < last_seq) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
- }
+ le64_to_cpu(j->seq) < jlist->last_seq)
+ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+ /*
+ * genradixes are indexed by a ulong, not a u64, so we can't index them
+ * by sequence number directly: Assume instead that they will all fall
+	 * within the range of +-2 billion of the first one we find.
+ */
+ if (!c->journal_entries_base_seq)
+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
/* Drop entries we don't need anymore */
- if (!JSET_NO_FLUSH(j)) {
- list_for_each_entry_safe(i, pos, jlist->head, list) {
- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+ genradix_for_each_from(&c->journal_entries, iter, _i,
+ journal_entry_radix_idx(c, jlist->last_seq)) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ if (le64_to_cpu(i->j.seq) >= last_seq)
break;
journal_replay_free(c, i);
}
}
- list_for_each_entry_reverse(i, jlist->head, list) {
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
- }
+ jlist->last_seq = max(jlist->last_seq, last_seq);
- where = jlist->head;
-add:
- dup = where->next != jlist->head
- ? container_of(where->next, struct journal_replay, list)
- : NULL;
-
- if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
- dup = NULL;
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+ GFP_KERNEL);
+ if (!_i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
/*
* Duplicate journal entries? If so we want the one that didn't have a
* checksum error:
*/
+ dup = *_i;
if (dup) {
- if (dup->bad) {
- /* we'll replace @dup: */
- } else if (bad) {
+ if (bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes)) {
i = dup;
goto found;
- } else {
- fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
- memcmp(j, &dup->j, bytes), c,
- "found duplicate but non identical journal entries (seq %llu)",
- le64_to_cpu(j->seq));
+ }
+
+ if (!entry_ptr.csum_good) {
i = dup;
goto found;
}
- }
- i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i) {
- ret = -ENOMEM;
- goto out;
+ if (!dup->csum_good)
+ goto replace;
+
+ fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+ le64_to_cpu(j->seq));
+ i = dup;
+ goto found;
}
+replace:
+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
- i->nr_ptrs = 0;
- i->bad = bad;
+ i->nr_ptrs = 0;
+ i->csum_good = entry_ptr.csum_good;
i->ignore = false;
- memcpy(&i->j, j, bytes);
+ unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
- i->nr_ptrs = dup->nr_ptrs;
- memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
- __journal_replay_free(dup);
+ if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
+ }
+
+ /* The first ptr should represent the jset we kept: */
+ memcpy(i->ptrs + i->nr_ptrs,
+ dup->ptrs,
+ sizeof(dup->ptrs[0]) * dup->nr_ptrs);
+ i->nr_ptrs += dup->nr_ptrs;
+ __journal_replay_free(c, dup);
}
- list_add(&i->list, where);
+ *_i = i;
+ return 0;
found:
for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
if (ptr->dev == ca->dev_idx) {
@@ -156,16 +195,6 @@ fsck_err:
return ret;
}
-static struct nonce journal_nonce(const struct jset *jset)
-{
- return (struct nonce) {{
- [0] = 0,
- [1] = ((__le32 *) &jset->seq)[0],
- [2] = ((__le32 *) &jset->seq)[1],
- [3] = BCH_NONCE_JOURNAL,
- }};
-}
-
/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
@@ -179,66 +208,84 @@ static void journal_entry_null_range(void *start, void *end)
#define JOURNAL_ENTRY_NONE 6
#define JOURNAL_ENTRY_BAD 7
-#define journal_entry_err(c, msg, ...) \
+static void journal_entry_err_msg(struct printbuf *out,
+ struct jset *jset,
+ struct jset_entry *entry)
+{
+ prt_str(out, "invalid journal entry ");
+ if (entry)
+ prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);
+
+ if (!jset)
+ prt_printf(out, "in superblock");
+ else if (!entry)
+ prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
+ else
+ prt_printf(out, "at offset %zi/%u seq %llu",
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ le64_to_cpu(jset->seq));
+ prt_str(out, ": ");
+}
+
+#define journal_entry_err(c, jset, entry, msg, ...) \
({ \
+ struct printbuf buf = PRINTBUF; \
+ \
+ journal_entry_err_msg(&buf, jset, entry); \
+ prt_printf(&buf, msg, ##__VA_ARGS__); \
+ \
switch (write) { \
case READ: \
- mustfix_fsck_err(c, msg, ##__VA_ARGS__); \
+ mustfix_fsck_err(c, "%s", buf.buf); \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write:\n" \
- msg, ##__VA_ARGS__); \
+ bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
if (bch2_fs_inconsistent(c)) { \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
} \
break; \
} \
+ \
+ printbuf_exit(&buf); \
true; \
})
-#define journal_entry_err_on(cond, c, msg, ...) \
- ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \
+ ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
#define FSCK_DELETED_KEY 5
-static int journal_validate_key(struct bch_fs *c, const char *where,
+static int journal_validate_key(struct bch_fs *c,
+ struct jset *jset,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
- struct bkey_i *k, const char *type,
+ struct bkey_i *k,
unsigned version, int big_endian, int write)
{
void *next = vstruct_next(entry);
- const char *invalid;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
- if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in %s entry offset %zi/%u: k->u64s 0",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s))) {
+ if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
return FSCK_DELETED_KEY;
}
if (journal_entry_err_on((void *) bkey_next(k) >
- (void *) vstruct_next(entry), c,
- "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s))) {
+ (void *) vstruct_next(entry),
+ c, jset, entry,
+ "extends past end of journal entry")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
return FSCK_DELETED_KEY;
}
- if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in %s entry offset %zi/%u: bad format %u",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s),
- k->k.format)) {
+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
+ c, jset, entry,
+ "bad format %u", k->k.format)) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
@@ -249,21 +296,29 @@ static int journal_validate_key(struct bch_fs *c, const char *where,
bch2_bkey_compat(level, btree_id, version, big_endian,
write, NULL, bkey_to_packed(k));
- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
- __btree_node_type(level, btree_id));
- if (invalid) {
- char buf[160];
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
+ bch2_jset_entry_types[entry->type],
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s),
+ le64_to_cpu(jset->seq));
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+ prt_newline(&buf);
+ bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf);
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
- type, where,
- (u64 *) k - entry->_data,
- le16_to_cpu(entry->u64s),
- invalid, buf);
+ mustfix_fsck_err(c, "%s", buf.buf);
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
+
+ printbuf_exit(&buf);
return FSCK_DELETED_KEY;
}
@@ -271,21 +326,22 @@ static int journal_validate_key(struct bch_fs *c, const char *where,
bch2_bkey_compat(level, btree_id, version, big_endian,
write, NULL, bkey_to_packed(k));
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
static int journal_entry_btree_keys_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
struct bkey_i *k = entry->start;
while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, where, entry,
+ int ret = journal_validate_key(c, jset, entry,
entry->level,
entry->btree_id,
- k, "key", version, big_endian, write);
+ k, version, big_endian, write|BKEY_INVALID_FROM_JOURNAL);
if (ret == FSCK_DELETED_KEY)
continue;
@@ -299,15 +355,21 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
struct jset_entry *entry)
{
struct bkey_i *k;
+ bool first = true;
- pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
-
- vstruct_for_each(entry, k)
+ jset_entry_for_each_key(entry, k) {
+ if (!first) {
+ prt_newline(out);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ }
+ prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+ first = false;
+ }
}
static int journal_entry_btree_root_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
@@ -315,7 +377,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(!entry->u64s ||
- le16_to_cpu(entry->u64s) != k->k.u64s, c,
+ le16_to_cpu(entry->u64s) != k->k.u64s,
+ c, jset, entry,
"invalid btree root journal entry: wrong number of keys")) {
void *next = vstruct_next(entry);
/*
@@ -328,8 +391,8 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
return 0;
}
- return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
- "btree root", version, big_endian, write);
+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
+ version, big_endian, write);
fsck_err:
return ret;
}
@@ -341,7 +404,7 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs
}
static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
@@ -355,13 +418,14 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs
}
static int journal_entry_blacklist_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
int ret = 0;
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
+ c, jset, entry,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
@@ -375,18 +439,19 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs
struct jset_entry_blacklist *bl =
container_of(entry, struct jset_entry_blacklist, entry);
- pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
+ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
- if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
+ c, jset, entry,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
goto out;
@@ -395,7 +460,8 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
- le64_to_cpu(bl_entry->end), c,
+ le64_to_cpu(bl_entry->end),
+ c, jset, entry,
"invalid journal seq blacklist entry: start > end")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
@@ -410,13 +476,13 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_
struct jset_entry_blacklist_v2 *bl =
container_of(entry, struct jset_entry_blacklist_v2, entry);
- pr_buf(out, "start=%llu end=%llu",
+ prt_printf(out, "start=%llu end=%llu",
le64_to_cpu(bl->start),
le64_to_cpu(bl->end));
}
static int journal_entry_usage_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
@@ -426,7 +492,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(bytes < sizeof(*u),
- c,
+ c, jset, entry,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@@ -442,13 +508,13 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- pr_buf(out, "type=%s v=%llu",
+ prt_printf(out, "type=%s v=%llu",
bch2_fs_usage_types[u->entry.btree_id],
le64_to_cpu(u->v));
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
@@ -459,7 +525,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
if (journal_entry_err_on(bytes < sizeof(*u) ||
bytes < sizeof(*u) + u->r.nr_devs,
- c,
+ c, jset, entry,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@@ -476,11 +542,11 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs
container_of(entry, struct jset_entry_data_usage, entry);
bch2_replicas_entry_to_text(out, &u->r);
- pr_buf(out, "=%llu", le64_to_cpu(u->v));
+ prt_printf(out, "=%llu", le64_to_cpu(u->v));
}
static int journal_entry_clock_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
@@ -490,13 +556,13 @@ static int journal_entry_clock_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
- c, "invalid journal entry clock: bad size")) {
+ c, jset, entry, "bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
- c, "invalid journal entry clock: bad rw")) {
+ c, jset, entry, "bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
@@ -511,11 +577,11 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
- pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+ prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}
static int journal_entry_dev_usage_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
@@ -527,7 +593,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(bytes < expected,
- c, "invalid journal entry dev usage: bad size (%u < %u)",
+ c, jset, entry, "bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@@ -536,13 +602,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
- c, "invalid journal entry dev usage: bad dev")) {
+ c, jset, entry, "bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
- c, "invalid journal entry dev usage: bad pad")) {
+ c, jset, entry, "bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
@@ -558,26 +624,24 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
- pr_buf(out, "dev=%u", le32_to_cpu(u->dev));
+ prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
if (i < BCH_DATA_NR)
- pr_buf(out, " %s", bch2_data_types[i]);
+ prt_printf(out, " %s", bch2_data_types[i]);
else
- pr_buf(out, " (unknown data type %u)", i);
- pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ prt_printf(out, " (unknown data type %u)", i);
+ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
- pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
- le64_to_cpu(u->buckets_ec),
- le64_to_cpu(u->buckets_unavailable));
+ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}
static int journal_entry_log_validate(struct bch_fs *c,
- const char *where,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
@@ -590,11 +654,26 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
- bch_scnmemcpy(out, l->d, strnlen(l->d, bytes));
+ prt_printf(out, "%.*s", bytes, l->d);
+}
+
+static int journal_entry_overwrite_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
}
struct jset_entry_ops {
- int (*validate)(struct bch_fs *, const char *,
+ int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int, int);
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
@@ -609,12 +688,13 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#undef x
};
-int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
+int bch2_journal_entry_validate(struct bch_fs *c,
+ struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
return entry->type < BCH_JSET_ENTRY_NR
- ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
version, big_endian, write)
: 0;
}
@@ -623,34 +703,28 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
if (entry->type < BCH_JSET_ENTRY_NR) {
- pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
} else {
- pr_buf(out, "(unknown type %u)", entry->type);
+ prt_printf(out, "(unknown type %u)", entry->type);
}
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
int write)
{
- char buf[100];
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(jset, entry) {
- scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
- le64_to_cpu(jset->seq),
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s));
-
if (journal_entry_err_on(vstruct_next(entry) >
- vstruct_last(jset), c,
+ vstruct_last(jset), c, jset, entry,
"journal entry extends past end of jset")) {
jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
break;
}
- ret = bch2_journal_entry_validate(c, buf, entry,
+ ret = bch2_journal_entry_validate(c, jset, entry,
le32_to_cpu(jset->version),
JSET_BIG_ENDIAN(jset), write);
if (ret)
@@ -663,12 +737,8 @@ fsck_err:
static int jset_validate(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read,
int write)
{
- size_t bytes = vstruct_bytes(jset);
- struct bch_csum csum;
unsigned version;
int ret = 0;
@@ -678,70 +748,80 @@ static int jset_validate(struct bch_fs *c,
version = le32_to_cpu(jset->version);
if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
version < bcachefs_metadata_version_min) ||
- version >= bcachefs_metadata_version_max, c,
+ version >= bcachefs_metadata_version_max,
+ c, jset, NULL,
"%s sector %llu seq %llu: unknown journal entry version %u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
version)) {
/* don't try to continue: */
- return EINVAL;
+ return -EINVAL;
}
- if (bytes > (sectors_read << 9) &&
- sectors_read < bucket_sectors_left)
- return JOURNAL_ENTRY_REREAD;
-
- if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
- "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq), bytes)) {
- ret = JOURNAL_ENTRY_BAD;
- le32_add_cpu(&jset->u64s,
- -((bytes - (bucket_sectors_left << 9)) / 8));
- }
-
- if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
+ c, jset, NULL,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset))) {
+ JSET_CSUM_TYPE(jset)))
ret = JOURNAL_ENTRY_BAD;
- goto csum_done;
- }
- if (write)
- goto csum_done;
-
- csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
- if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
- "%s sector %llu seq %llu: journal checksum bad",
- ca ? ca->name : c->name,
- sector, le64_to_cpu(jset->seq)))
- ret = JOURNAL_ENTRY_BAD;
-
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
- jset->encrypted_start,
- vstruct_end(jset) - (void *) jset->encrypted_start);
-csum_done:
/* last_seq is ignored when JSET_NO_FLUSH is true */
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
- le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
+ c, jset, NULL,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(jset->last_seq),
le64_to_cpu(jset->seq))) {
jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
+
+ ret = jset_validate_entries(c, jset, write);
fsck_err:
return ret;
}
-static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
+static int jset_validate_early(struct bch_fs *c,
+ struct bch_dev *ca,
+ struct jset *jset, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read)
{
- unsigned sectors = vstruct_sectors(jset, c->block_bits);
+ size_t bytes = vstruct_bytes(jset);
+ unsigned version;
+ int write = READ;
+ int ret = 0;
+
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
- return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
- jset_validate_entries(c, jset, WRITE);
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
+ version < bcachefs_metadata_version_min) ||
+ version >= bcachefs_metadata_version_max,
+ c, jset, NULL,
+ "%s sector %llu seq %llu: unknown journal entry version %u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ version)) {
+ /* don't try to continue: */
+ return -EINVAL;
+ }
+
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
+
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+ c, jset, NULL,
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq), bytes))
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
+fsck_err:
+ return ret;
}
struct journal_read_buf {
@@ -756,12 +836,12 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
/* the bios are sized for this many pages, max: */
if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
n = kvpmalloc(new_size, GFP_KERNEL);
if (!n)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
kvpfree(b->data, b->size);
b->data = n;
@@ -780,7 +860,7 @@ static int journal_read_bucket(struct bch_dev *ca,
unsigned sectors, sectors_read = 0;
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
- bool saw_bad = false;
+ bool saw_bad = false, csum_good;
int ret = 0;
pr_debug("reading %u", bucket);
@@ -788,20 +868,20 @@ static int journal_read_bucket(struct bch_dev *ca,
while (offset < end) {
if (!sectors_read) {
struct bio *bio;
+ unsigned nr_bvecs;
reread:
sectors_read = min_t(unsigned,
end - offset, buf->size >> 9);
+ nr_bvecs = buf_pages(buf->data, sectors_read << 9);
- bio = bio_kmalloc(GFP_KERNEL,
- buf_pages(buf->data,
- sectors_read << 9));
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = offset;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+
+ bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
ret = submit_bio_wait(bio);
- bio_put(bio);
+ kfree(bio);
if (bch2_dev_io_err_on(ret, ca,
"journal read error: sector %llu",
@@ -819,11 +899,10 @@ reread:
j = buf->data;
}
- ret = jset_validate(c, ca, j, offset,
- end - offset, sectors_read,
- READ);
+ ret = jset_validate_early(c, ca, j, offset,
+ end - offset, sectors_read);
switch (ret) {
- case BCH_FSCK_OK:
+ case 0:
sectors = vstruct_sectors(j, c->block_bits);
break;
case JOURNAL_ENTRY_REREAD:
@@ -837,17 +916,13 @@ reread:
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
return 0;
- sectors = block_sectors(c);
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
* again at next block boundary:
*/
sectors = block_sectors(c);
- break;
+ goto next_block;
default:
return ret;
}
@@ -863,11 +938,25 @@ reread:
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+ csum_good = jset_csum_good(c, j);
+ if (!csum_good)
+ saw_bad = true;
+
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+ bch2_fs_fatal_err_on(ret, c,
+ "error decrypting journal entry: %i", ret);
+
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
- .dev = ca->dev_idx,
- .offset = offset,
- }, jlist, j, ret != 0);
+ ret = journal_entry_add(c, ca, (struct journal_ptr) {
+ .csum_good = csum_good,
+ .dev = ca->dev_idx,
+ .bucket = bucket,
+ .bucket_offset = offset -
+ bucket_to_sector(ca, ja->buckets[bucket]),
+ .sector = offset,
+ }, jlist, j);
mutex_unlock(&jlist->lock);
switch (ret) {
@@ -896,8 +985,9 @@ static void bch2_journal_read_device(struct closure *cl)
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
+ struct journal_replay *r, **_r;
+ struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
- u64 min_seq = U64_MAX;
unsigned i;
int ret = 0;
@@ -916,26 +1006,39 @@ static void bch2_journal_read_device(struct closure *cl)
goto err;
}
- /* Find the journal bucket with the highest sequence number: */
- for (i = 0; i < ja->nr; i++) {
- if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
- ja->cur_idx = i;
+ ja->sectors_free = ca->mi.bucket_size;
- min_seq = min(ja->bucket_seq[i], min_seq);
- }
+ mutex_lock(&jlist->lock);
+ genradix_for_each_reverse(&c->journal_entries, iter, _r) {
+ r = *_r;
- /*
- * If there's duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- while (ja->bucket_seq[ja->cur_idx] > min_seq &&
- ja->bucket_seq[ja->cur_idx] >
- ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ if (!r)
+ continue;
+
+ for (i = 0; i < r->nr_ptrs; i++) {
+ if (r->ptrs[i].dev == ca->dev_idx) {
+ unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+ vstruct_sectors(&r->j, c->block_bits);
+
+ ja->cur_idx = r->ptrs[i].bucket;
+ ja->sectors_free = ca->mi.bucket_size - wrote;
+ goto found;
+ }
+ }
+ }
+found:
+ mutex_unlock(&jlist->lock);
- ja->sectors_free = 0;
+ if (ja->bucket_seq[ja->cur_idx] &&
+ ja->sectors_free == ca->mi.bucket_size) {
+ bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
+ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
+ for (i = 0; i < 3; i++) {
+ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
+ }
+ ja->sectors_free = 0;
+ }
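	/*
	 * Example of the resume point computed above (hypothetical numbers):
	 * if the newest entry found on this device starts 96 sectors into
	 * bucket 7 and is 32 sectors long, then wrote = 96 + 32 = 128, so
	 * cur_idx = 7 and sectors_free = bucket_size - 128; new journal
	 * writes then continue right after the last entry that was read back.
	 */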
/*
* Set dirty_idx to indicate the entire journal is full and needs to be
@@ -957,8 +1060,8 @@ err:
goto out;
}
-static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
{
unsigned i;
@@ -966,35 +1069,40 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
u64 offset;
- div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
+ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
if (i)
- pr_buf(out, " ");
- pr_buf(out, "%u:%llu (offset %llu)",
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
j->ptrs[i].dev,
- (u64) j->ptrs[i].offset, offset);
+ j->ptrs[i].bucket,
+ j->ptrs[i].bucket_offset,
+ j->ptrs[i].sector);
}
}
-int bch2_journal_read(struct bch_fs *c, struct list_head *list,
- u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c,
+ u64 *last_seq,
+ u64 *blacklist_seq,
+ u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i, *t;
+ struct journal_replay *i, **_i, *prev = NULL;
+ struct genradix_iter radix_iter;
struct bch_dev *ca;
unsigned iter;
- size_t keys = 0, entries = 0;
- bool degraded = false;
- u64 seq, last_seq = 0;
+ struct printbuf buf = PRINTBUF;
+ bool degraded = false, last_write_torn = false;
+ u64 seq;
int ret = 0;
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
- jlist.head = list;
+ jlist.last_seq = 0;
jlist.ret = 0;
for_each_member_device(ca, c, iter) {
- if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ if (!c->opts.fsck &&
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
@@ -1014,43 +1122,74 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
if (jlist.ret)
return jlist.ret;
- if (list_empty(list)) {
- bch_info(c, "journal read done, but no entries found");
- return 0;
- }
-
- i = list_last_entry(list, struct journal_replay, list);
- *start_seq = le64_to_cpu(i->j.seq) + 1;
+ *last_seq = 0;
+ *start_seq = 0;
+ *blacklist_seq = 0;
/*
	 * Find most recent flush entry, and ignore newer non-flush entries -
* those entries will be blacklisted:
*/
- list_for_each_entry_safe_reverse(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ int write = READ;
+
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
- if (!JSET_NO_FLUSH(&i->j)) {
- last_seq = le64_to_cpu(i->j.last_seq);
- *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
- break;
+ if (!*start_seq)
+ *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+ if (JSET_NO_FLUSH(&i->j)) {
+ i->ignore = true;
+ continue;
}
- journal_replay_free(c, i);
+ if (!last_write_torn && !i->csum_good) {
+ last_write_torn = true;
+ i->ignore = true;
+ continue;
+ }
+
+ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+ c, &i->j, NULL,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(i->j.last_seq),
+ le64_to_cpu(i->j.seq)))
+ i->j.last_seq = i->j.seq;
+
+ *last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
}
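	/*
	 * Sketch of the reverse scan above, with hypothetical sequence
	 * numbers: given entries 10 (flush, good csum), 11 (noflush) and
	 * 12 (noflush), the newest entry sets *start_seq = *blacklist_seq = 13,
	 * the two noflush entries are marked ignore, and entry 10 supplies
	 * *last_seq and pulls *blacklist_seq back to 11 - so seqs 11 and 12
	 * get blacklisted rather than replayed. A single newest flush write
	 * with a bad checksum is treated as torn and skipped once via
	 * last_write_torn.
	 */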
- if (!last_seq) {
+ if (!*start_seq) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
+ if (!*last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
- return -1;
+ return 0;
}
+ bch_info(c, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
+
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
+
/* Drop blacklisted entries and entries older than last_seq: */
- list_for_each_entry_safe(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
seq = le64_to_cpu(i->j.seq);
- if (seq < last_seq) {
+ if (seq < *last_seq) {
journal_replay_free(c, i);
continue;
}
@@ -1058,22 +1197,23 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
"found blacklisted journal entry %llu", seq);
-
- journal_replay_free(c, i);
+ i->ignore = true;
}
}
/* Check for missing entries: */
- seq = last_seq;
- list_for_each_entry(i, list, list) {
- if (i->ignore)
+ seq = *last_seq;
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
while (seq < le64_to_cpu(i->j.seq)) {
u64 missing_start, missing_end;
- char buf1[200], buf2[200];
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
while (seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_is_blacklisted(c, seq, false))
@@ -1088,44 +1228,57 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
!bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
- if (i->list.prev != list) {
- struct printbuf out = PBUF(buf1);
- struct journal_replay *p = list_prev_entry(i, list);
-
- bch2_journal_ptrs_to_text(&out, c, p);
- pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf1, c, prev);
+ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
} else
- sprintf(buf1, "(none)");
- bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+ prt_printf(&buf1, "(none)");
+ bch2_journal_ptrs_to_text(&buf2, c, i);
missing_end = seq - 1;
fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" prev at %s\n"
" next at %s",
missing_start, missing_end,
- last_seq, *blacklist_seq - 1,
- buf1, buf2);
+ *last_seq, *blacklist_seq - 1,
+ buf1.buf, buf2.buf);
+
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
}
+ prev = i;
seq++;
}
- list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_required = 1,
};
unsigned ptr;
- char buf[80];
- if (i->ignore)
+ i = *_i;
+ if (!i || i->ignore)
continue;
- ret = jset_validate_entries(c, &i->j, READ);
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+
+ if (!i->ptrs[ptr].csum_good)
+ bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ "invalid journal checksum, seq %llu%s",
+ le64_to_cpu(i->j.seq),
+ i->csum_good ? " (had good copy on another device)" : "");
+ }
+
+ ret = jset_validate(c,
+ bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ &i->j,
+ i->ptrs[0].sector,
+ READ);
if (ret)
- goto fsck_err;
+ goto err;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
@@ -1137,29 +1290,21 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
* the devices - this is wrong:
*/
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, &replicas.e);
+
if (!degraded &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
- "superblock not marked as containing replicas %s",
- (bch2_replicas_entry_to_text(&PBUF(buf),
- &replicas.e), buf)))) {
+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
+ "superblock not marked as containing replicas %s",
+ buf.buf)) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
- return ret;
+ goto err;
}
-
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
- entries++;
}
-
- bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
- keys, entries, *start_seq);
-
- if (*start_seq != *blacklist_seq)
- bch_info(c, "dropped unflushed entries %llu-%llu",
- *blacklist_seq, *start_seq - 1);
+err:
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
@@ -1194,8 +1339,7 @@ static void __journal_write_alloc(struct journal *j,
if (!ca->mi.durability ||
ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
- bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
- ca->dev_idx) ||
+ bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
sectors > ja->sectors_free)
continue;
@@ -1286,49 +1430,6 @@ done:
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
-static void journal_write_compact(struct jset *jset)
-{
- struct jset_entry *i, *next, *prev = NULL;
-
- /*
- * Simple compaction, dropping empty jset_entries (from journal
- * reservations that weren't fully used) and merging jset_entries that
- * can be.
- *
- * If we wanted to be really fancy here, we could sort all the keys in
- * the jset and drop keys that were overwritten - probably not worth it:
- */
- vstruct_for_each_safe(jset, i, next) {
- unsigned u64s = le16_to_cpu(i->u64s);
-
- /* Empty entry: */
- if (!u64s)
- continue;
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == BCH_JSET_ENTRY_btree_keys &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
- }
-
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
/* we aren't holding j->lock: */
@@ -1354,7 +1455,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
- return j->buf + j->reservations.unwritten_idx;
+ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
static void journal_write_done(struct closure *cl)
@@ -1391,15 +1492,20 @@ static void journal_write_done(struct closure *cl)
journal_seq_pin(j, seq)->devs = w->devs_written;
if (!err) {
- j->seq_ondisk = seq;
-
if (!JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
+
+ bch2_do_discards(c);
+ closure_wake_up(&c->freelist_wait);
+
+ bch2_reset_alloc_cursors(c);
}
} else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq;
+ j->seq_ondisk = seq;
+
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
@@ -1407,7 +1513,8 @@ static void journal_write_done(struct closure *cl)
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- journal_reclaim_kick(&c->journal);
+ if (j->watermark)
+ journal_reclaim_kick(&c->journal);
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
@@ -1415,7 +1522,7 @@ static void journal_write_done(struct closure *cl)
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
- BUG_ON(new.idx == new.unwritten_idx);
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
@@ -1426,13 +1533,24 @@ static void journal_write_done(struct closure *cl)
closure_wake_up(&w->wait);
journal_wake(j);
- if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
- mod_delayed_work(c->io_complete_wq, &j->write_work, 0);
- spin_unlock(&j->lock);
-
- if (new.unwritten_idx != new.idx &&
- !journal_state_count(new, new.unwritten_idx))
+ if (!journal_state_count(new, new.unwritten_idx) &&
+ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ struct journal_buf *buf = journal_cur_buf(j);
+ long delta = buf->expires - jiffies;
+
+ /*
+		 * We don't close a journal entry to write it while there are
+		 * previous entries still in flight - the current journal entry
+ * might want to be written now:
+ */
+
+ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+ }
+
+ spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
@@ -1477,12 +1595,10 @@ static void do_journal_write(struct closure *cl)
sectors);
bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
- bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
ca->prev_journal_sector = bio->bi_iter.bi_sector;
@@ -1494,7 +1610,7 @@ static void do_journal_write(struct closure *cl)
bch2_bio_map(bio, w->data, sectors << 9);
- trace_journal_write(bio);
+ trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] =
@@ -1505,6 +1621,52 @@ static void do_journal_write(struct closure *cl)
return;
}
+static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+{
+ struct jset_entry *i, *next, *prev = NULL;
+
+ /*
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ vstruct_for_each_safe(jset, i, next) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /* Empty entry: */
+ if (!u64s)
+ continue;
+
+ if (i->type == BCH_JSET_ENTRY_btree_root)
+ bch2_journal_entry_to_btree_root(c, i);
+
+ /* Can we merge with previous entry? */
+ if (prev &&
+ i->btree_id == prev->btree_id &&
+ i->level == prev->level &&
+ i->type == prev->type &&
+ i->type == BCH_JSET_ENTRY_btree_keys &&
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+ memmove_u64s_down(vstruct_next(prev),
+ i->_data,
+ u64s);
+ le16_add_cpu(&prev->u64s, u64s);
+ continue;
+ }
+
+ /* Couldn't merge, move i into new position (after prev): */
+ prev = prev ? vstruct_next(prev) : jset->start;
+ if (i != prev)
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
+ }
+
+ prev = prev ? vstruct_next(prev) : jset->start;
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
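/*
 * Illustration of the merge step above (hypothetical entries): two adjacent
 * BCH_JSET_ENTRY_btree_keys entries for the same btree_id/level with
 * u64s = 6 and u64s = 4 become one entry with u64s = 10; the second entry's
 * keys are memmoved to just after the first entry's keys and its header is
 * dropped, shrinking the jset before it is written.
 */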
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
@@ -1514,7 +1676,7 @@ void bch2_journal_write(struct closure *cl)
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
- char *journal_debug_buf = NULL;
+ struct printbuf journal_debug_buf = PRINTBUF;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
@@ -1527,8 +1689,26 @@ void bch2_journal_write(struct closure *cl)
j->write_start_time = local_clock();
spin_lock(&j->lock);
- if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
- (w->noflush ||
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+	 * noflush so they'll never be used, but they'll still be visible to the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
+ (bch2_journal_error(j) ||
+ w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
@@ -1538,9 +1718,13 @@ void bch2_journal_write(struct closure *cl)
w->last_seq = 0;
j->nr_noflush_writes++;
- } else {
+ } else if (!bch2_journal_error(j)) {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ } else {
+ spin_unlock(&j->lock);
+ goto err;
}
spin_unlock(&j->lock);
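	/*
	 * A summary sketch of the three outcomes of the branch above:
	 *  - JOURNAL_NEED_FLUSH_WRITE clear, and the journal errored, the
	 *    write was already marked noflush, or a recent flush makes the
	 *    flush skippable: issue a noflush write (w->last_seq forced to 0);
	 *  - otherwise, if the journal hasn't errored: issue a flush write
	 *    and clear JOURNAL_NEED_FLUSH_WRITE;
	 *  - otherwise (errored while a flush write is still owed): write
	 *    nothing and bail out to err.
	 */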
@@ -1554,7 +1738,7 @@ void bch2_journal_write(struct closure *cl)
* entry:
*/
- bch2_journal_entries_to_btree_roots(c, jset);
+ bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset);
@@ -1568,10 +1752,8 @@ void bch2_journal_write(struct closure *cl)
le32_add_cpu(&jset->u64s, u64s);
BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
- journal_write_compact(jset);
-
jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
+ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
@@ -1588,18 +1770,21 @@ void bch2_journal_write(struct closure *cl)
validate_before_checksum = true;
if (validate_before_checksum &&
- jset_validate_for_write(c, jset))
+ jset_validate(c, NULL, jset, 0, WRITE))
goto err;
- bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
+ if (bch2_fs_fatal_err_on(ret, c,
+			"error encrypting journal entry: %i", ret))
+ goto err;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
- jset_validate_for_write(c, jset))
+ jset_validate(c, NULL, jset, 0, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
@@ -1618,11 +1803,8 @@ retry_alloc:
goto retry_alloc;
}
- if (ret) {
- journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
- if (journal_debug_buf)
- __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
- }
+ if (ret)
+ __bch2_journal_debug_to_text(&journal_debug_buf, j);
/*
* write is allocated, no longer need to account for it in
@@ -1639,8 +1821,8 @@ retry_alloc:
if (ret) {
bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf);
- kfree(journal_debug_buf);
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
@@ -1648,7 +1830,7 @@ retry_alloc:
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
- if (test_bit(JOURNAL_NOCHANGES, &j->flags))
+ if (c->opts.nochanges)
goto no_io;
for_each_rw_member(ca, c, i)
@@ -1662,9 +1844,7 @@ retry_alloc:
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
@@ -1677,6 +1857,6 @@ no_io:
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
err:
- bch2_inconsistent_error(c);
+ bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index d8425fe0d67b..8801e98104bd 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -7,12 +7,16 @@
* during cache_registration
*/
struct journal_replay {
- struct list_head list;
- struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+ struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+ } ptrs[BCH_REPLICAS_MAX];
unsigned nr_ptrs;
- /* checksum error, but we may want to try using it anyways: */
- bool bad;
+ bool csum_good;
bool ignore;
/* must be last: */
struct jset j;
@@ -36,16 +40,24 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
-#define for_each_jset_key(k, _n, entry, jset) \
- for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
- vstruct_for_each_safe(entry, k, _n)
+#define jset_entry_for_each_key(_e, _k) \
+ for (_k = (_e)->start; \
+ _k < vstruct_last(_e); \
+ _k = bkey_next(_k))
-int bch2_journal_entry_validate(struct bch_fs *, const char *,
+#define for_each_jset_key(k, entry, jset) \
+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
+ jset_entry_for_each_key(entry, k)
+
+int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int, int);
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
struct jset_entry *);
-int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct journal_replay *);
+
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index b9bf381ef15e..37c6846a30aa 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -2,6 +2,8 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
@@ -11,7 +13,6 @@
#include <linux/kthread.h>
#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
#include <trace/events/bcachefs.h>
/* Free space calculations: */
@@ -35,10 +36,8 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
struct journal_device *ja,
enum journal_space_from from)
{
- unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags)
- ? ((journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr)
- : ja->nr;
+ unsigned available = (journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr;
/*
* Don't use the last bucket unless writing the new last_seq
@@ -62,25 +61,13 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
old.v, new.v)) != old.v);
}
-static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
-{
- unsigned sectors = 0;
-
- while (!sectors && *idx != j->reservations.idx) {
- sectors = j->buf[*idx].sectors;
-
- *idx = (*idx + 1) & JOURNAL_BUF_MASK;
- }
-
- return sectors;
-}
-
static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
struct journal_device *ja = &ca->journal;
- unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
+ unsigned sectors, buckets, unwritten;
+ u64 seq;
if (from == journal_space_total)
return (struct journal_space) {
@@ -95,7 +82,14 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
	 * Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
- while ((unwritten = get_unwritten_sectors(j, &idx))) {
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+ if (!unwritten)
+ continue;
+
/* entry won't fit on this device, skip: */
if (unwritten > ca->mi.bucket_size)
continue;
@@ -203,7 +197,7 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
- ret = cur_entry_insufficient_devices;
+ ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
@@ -216,29 +210,13 @@ void bch2_journal_space_available(struct journal *j)
clean = j->space[journal_space_clean].total;
total = j->space[journal_space_total].total;
- if (!clean_ondisk &&
- j->reservations.idx ==
- j->reservations.unwritten_idx) {
- char *buf = kmalloc(4096, GFP_ATOMIC);
-
- bch_err(c, "journal stuck");
- if (buf) {
- __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
- pr_err("\n%s", buf);
- kfree(buf);
- }
-
- bch2_fatal_error(c);
- ret = cur_entry_journal_stuck;
- } else if (!j->space[journal_space_discarded].next_entry)
- ret = cur_entry_journal_full;
- else if (!fifo_free(&j->pin))
- ret = cur_entry_journal_pin_full;
+ if (!j->space[journal_space_discarded].next_entry)
+ ret = JOURNAL_ERR_journal_full;
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
(clean - clean_ondisk <= total / 8) &&
- (clean_ondisk * 2 > clean ))
+ (clean_ondisk * 2 > clean))
set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
@@ -252,7 +230,7 @@ out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
- journal_check_may_get_unreserved(j);
+ journal_set_watermark(j);
if (!ret)
journal_wake(j);
@@ -287,12 +265,13 @@ void bch2_journal_do_discards(struct journal *j)
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ if (!c->opts.nochanges &&
+ ca->mi.discard &&
+ bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->discard_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
+ ca->mi.bucket_size, GFP_NOIO);
spin_lock(&j->lock);
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
@@ -323,9 +302,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
*/
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
- BUG_ON(!fifo_pop(&j->pin, temp));
+ fifo_pop(&j->pin, temp);
popped = true;
}
@@ -352,13 +329,13 @@ void bch2_journal_pin_put(struct journal *j, u64 seq)
}
}
-static inline void __journal_pin_drop(struct journal *j,
+static inline bool __journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
struct journal_entry_pin_list *pin_list;
if (!journal_pin_active(pin))
- return;
+ return false;
if (j->flush_in_progress == pin)
j->flush_in_progress_dropped = true;
@@ -371,27 +348,36 @@ static inline void __journal_pin_drop(struct journal *j,
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
*/
- if (atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin))
- bch2_journal_reclaim_fast(j);
- else if (fifo_used(&j->pin) == 1 &&
- atomic_read(&pin_list->count) == 1)
- journal_wake(j);
+ return atomic_dec_and_test(&pin_list->count) &&
+ pin_list == &fifo_peek_front(&j->pin);
}
void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
spin_lock(&j->lock);
- __journal_pin_drop(j, pin);
+ if (__journal_pin_drop(j, pin))
+ bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
}
+enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+{
+ if (fn == bch2_btree_node_flush0 ||
+ fn == bch2_btree_node_flush1)
+ return JOURNAL_PIN_btree;
+ else if (fn == bch2_btree_key_cache_journal_flush)
+ return JOURNAL_PIN_key_cache;
+ else
+ return JOURNAL_PIN_other;
+}
+
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
struct journal_entry_pin_list *pin_list;
+ bool reclaim;
spin_lock(&j->lock);
@@ -408,18 +394,19 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
pin_list = journal_seq_pin(j, seq);
- __journal_pin_drop(j, pin);
+ reclaim = __journal_pin_drop(j, pin);
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
- if (flush_fn == bch2_btree_key_cache_journal_flush)
- list_add(&pin->list, &pin_list->key_cache_list);
- else if (flush_fn)
- list_add(&pin->list, &pin_list->list);
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
else
list_add(&pin->list, &pin_list->flushed);
+
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
/*
@@ -450,37 +437,37 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j,
- bool get_any,
- bool get_key_cache,
- u64 max_seq, u64 *seq)
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
+ unsigned i;
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
- if (*seq > max_seq && !get_any && !get_key_cache)
+ if (*seq > seq_to_flush && !allowed_above_seq)
break;
- if (*seq <= max_seq || get_any) {
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret)
- return ret;
- }
-
- if (*seq <= max_seq || get_any || get_key_cache) {
- ret = list_first_entry_or_null(&pin_list->key_cache_list,
- struct journal_entry_pin, list);
- if (ret)
- return ret;
- }
+ for (i = 0; i < JOURNAL_PIN_NR; i++)
+ if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+ ((1U << i) & allowed_above_seq)) {
+ ret = list_first_entry_or_null(&pin_list->list[i],
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
}
return NULL;
}
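/*
 * allowed_below_seq/allowed_above_seq are bitmasks of journal_pin_type
 * classes: a class set in allowed_below_seq may be flushed only for pins at
 * or below seq_to_flush, while a class set in allowed_above_seq may be
 * flushed regardless of sequence number. For example, journal_flush_done()
 * below passes (1U << JOURNAL_PIN_key_cache)|(1U << JOURNAL_PIN_other)
 * in a first pass and (1U << JOURNAL_PIN_btree) in a second pass.
 */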
/* returns true if we did work */
-static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+static size_t journal_flush_pins(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
unsigned min_any,
unsigned min_key_cache)
{
@@ -493,15 +480,25 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
lockdep_assert_held(&j->reclaim_lock);
while (1) {
+ unsigned allowed_above = allowed_above_seq;
+ unsigned allowed_below = allowed_below_seq;
+
+ if (min_any) {
+ allowed_above |= ~0;
+ allowed_below |= ~0;
+ }
+
+ if (min_key_cache) {
+ allowed_above |= 1U << JOURNAL_PIN_key_cache;
+ allowed_below |= 1U << JOURNAL_PIN_key_cache;
+ }
+
cond_resched();
j->last_flushed = jiffies;
spin_lock(&j->lock);
- pin = journal_get_next_pin(j,
- min_any != 0,
- min_key_cache != 0,
- seq_to_flush, &seq);
+ pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
if (pin) {
BUG_ON(j->flush_in_progress);
j->flush_in_progress = pin;
@@ -598,7 +595,7 @@ static u64 journal_seq_to_flush(struct journal *j)
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
-static int __bch2_journal_reclaim(struct journal *j, bool direct)
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0;
@@ -647,8 +644,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
min_nr = 1;
- trace_journal_reclaim_start(c,
- min_nr,
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+ trace_and_count(c, journal_reclaim_start, c,
+ direct, kicked,
+ min_nr, min_key_cache,
j->prereserved.reserved,
j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
@@ -656,20 +656,19 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
nr_flushed = journal_flush_pins(j, seq_to_flush,
+ ~0, 0,
min_nr, min_key_cache);
if (direct)
j->nr_direct_reclaim += nr_flushed;
else
j->nr_background_reclaim += nr_flushed;
- trace_journal_reclaim_finish(c, nr_flushed);
+ trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
if (nr_flushed)
wake_up(&j->reclaim_wait);
- } while ((min_nr || min_key_cache) && !direct);
+ } while ((min_nr || min_key_cache) && nr_flushed && !direct);
memalloc_noreclaim_restore(flags);
@@ -678,7 +677,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
int bch2_journal_reclaim(struct journal *j)
{
- return __bch2_journal_reclaim(j, true);
+ return __bch2_journal_reclaim(j, true, true);
}
static int bch2_journal_reclaim_thread(void *arg)
@@ -686,6 +685,7 @@ static int bch2_journal_reclaim_thread(void *arg)
struct journal *j = arg;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned long delay, now;
+ bool journal_empty;
int ret = 0;
set_freezable();
@@ -693,10 +693,12 @@ static int bch2_journal_reclaim_thread(void *arg)
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
+ bool kicked = j->reclaim_kicked;
+
j->reclaim_kicked = false;
mutex_lock(&j->reclaim_lock);
- ret = __bch2_journal_reclaim(j, false);
+ ret = __bch2_journal_reclaim(j, false, kicked);
mutex_unlock(&j->reclaim_lock);
now = jiffies;
@@ -707,15 +709,22 @@ static int bch2_journal_reclaim_thread(void *arg)
j->next_reclaim = now + delay;
while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
if (kthread_should_stop())
break;
if (j->reclaim_kicked)
break;
- if (time_after_eq(jiffies, j->next_reclaim))
- break;
- freezable_schedule_timeout(j->next_reclaim - jiffies);
+ spin_lock(&j->lock);
+ journal_empty = fifo_empty(&j->pin);
+ spin_unlock(&j->lock);
+
+ if (journal_empty)
+ schedule();
+ else if (time_after(j->next_reclaim, jiffies))
+ schedule_timeout(j->next_reclaim - jiffies);
+ else
+ break;
}
__set_current_state(TASK_RUNNING);
}
@@ -739,15 +748,17 @@ int bch2_journal_reclaim_start(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct task_struct *p;
+ int ret;
if (j->reclaim_thread)
return 0;
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
- if (IS_ERR(p)) {
- bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
- return PTR_ERR(p);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret));
+ return ret;
}
get_task_struct(p);
@@ -767,7 +778,12 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
+ if (journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_key_cache)|
+ (1U << JOURNAL_PIN_other), 0, 0, 0) ||
+ journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_btree), 0, 0, 0))
+ *did_work = true;
spin_lock(&j->lock);
/*
@@ -776,8 +792,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
*/
ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
- (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+ !fifo_used(&j->pin);
spin_unlock(&j->lock);
mutex_unlock(&j->reclaim_lock);
@@ -825,10 +840,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
spin_lock(&j->lock);
- while (!ret && seq < j->pin.back) {
+ while (!ret) {
struct bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
+ if (seq >= j->pin.back)
+ break;
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
journal_seq_pin(j, seq)->devs);
seq++;
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
new file mode 100644
index 000000000000..fcefbbe7eda8
--- /dev/null
+++ b/fs/bcachefs/journal_sb.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+#include "darray.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+ const u64 *l = _l;
+ const u64 *r = _r;
+
+ return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ int ret = -BCH_ERR_invalid_sb_journal;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ nr = bch2_nr_journal_buckets(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_validate;
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ if (!b[0]) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0] < le16_to_cpu(m->first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0], le16_to_cpu(m->first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1], le64_to_cpu(m->nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1]) {
+ prt_printf(err, "duplicate journal buckets %llu", b[i]);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+ .validate = bch2_sb_journal_validate,
+ .to_text = bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+ const struct u64_range *l = _l;
+ const struct u64_range *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ int ret = -BCH_ERR_invalid_sb_journal;
+ unsigned nr;
+ unsigned i;
+ struct u64_range *b;
+
+ nr = bch2_sb_field_journal_v2_nr_entries(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
+
+ for (i = 0; i < nr; i++) {
+ b[i].start = le64_to_cpu(journal->d[i].start);
+ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+ }
+
+ sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+ if (!b[0].start) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0].start < le16_to_cpu(m->first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0].start, le16_to_cpu(m->first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++) {
+ if (b[i].end > b[i + 1].start) {
+ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+ goto err;
+ }
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu-%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+ .validate = bch2_sb_journal_v2_validate,
+ .to_text = bch2_sb_journal_v2_to_text,
+};
+
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
+ u64 *buckets, unsigned nr)
+{
+ struct bch_sb_field_journal_v2 *j;
+ unsigned i, dst = 0, nr_compacted = 1;
+
+ if (c)
+ lockdep_assert_held(&c->sb_lock);
+
+ if (!nr) {
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+ return 0;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (buckets[i] + 1 != buckets[i + 1])
+ nr_compacted++;
+
+ j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+ (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
+ if (!j)
+ return -BCH_ERR_ENOSPC_sb_journal;
+
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+ j->d[dst].start = le64_to_cpu(buckets[0]);
+ j->d[dst].nr = le64_to_cpu(1);
+
+ for (i = 1; i < nr; i++) {
+ if (buckets[i] == buckets[i - 1] + 1) {
+ le64_add_cpu(&j->d[dst].nr, 1);
+ } else {
+ dst++;
+ j->d[dst].start = le64_to_cpu(buckets[i]);
+ j->d[dst].nr = le64_to_cpu(1);
+ }
+ }
+
+ BUG_ON(dst + 1 != nr_compacted);
+ return 0;
+}
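/*
 * Worked example of the compaction above (hypothetical bucket numbers):
 * with buckets[] = { 100, 101, 102, 200, 201, 500 } and nr = 6, there are
 * two breaks in contiguity, so nr_compacted = 3 and the journal_v2 field
 * ends up holding:
 *
 *	{ .start = 100, .nr = 3 }
 *	{ .start = 200, .nr = 2 }
 *	{ .start = 500, .nr = 1 }
 *
 * The old BCH_SB_FIELD_journal field is deleted in favour of the v2 field.
 */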
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
new file mode 100644
index 000000000000..ba40a7e8d90a
--- /dev/null
+++ b/fs/bcachefs/journal_sb.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+ return j
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
+ : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+ if (!j)
+ return 0;
+
+ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 3cc63fc202ab..d6b9f2cdf8e7 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -103,7 +103,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
sb_blacklist_u64s(nr + 1));
if (!bl) {
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
goto out;
}
@@ -168,7 +168,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
GFP_KERNEL);
if (!t)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_blacklist_table_init;
t->nr = nr;
@@ -201,17 +201,17 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
if (le64_to_cpu(e->start) >=
le64_to_cpu(e->end)) {
- pr_buf(err, "entry %u start >= end (%llu >= %llu)",
+ prt_printf(err, "entry %u start >= end (%llu >= %llu)",
i, le64_to_cpu(e->start), le64_to_cpu(e->end));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
}
if (i + 1 < nr &&
le64_to_cpu(e[0].end) >
le64_to_cpu(e[1].start)) {
- pr_buf(err, "entry %u out of order with next entry (%llu > %llu)",
+ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
}
}
@@ -229,12 +229,13 @@ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
for (i = bl->start; i < bl->start + nr; i++) {
if (i != bl->start)
- pr_buf(out, " ");
+ prt_printf(out, " ");
- pr_buf(out, "%llu-%llu",
+ prt_printf(out, "%llu-%llu",
le64_to_cpu(i->start),
le64_to_cpu(i->end));
}
+ prt_newline(out);
}
const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
@@ -271,7 +272,7 @@ retry:
!test_bit(BCH_FS_STOPPING, &c->flags))
b = bch2_btree_iter_next_node(&iter);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index d6d751214116..8d8c0b3d5a30 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -25,6 +25,8 @@ struct journal_buf {
struct closure_waitlist wait;
u64 last_seq; /* copy of data->last_seq */
+ long expires;
+ u64 flush_time;
unsigned buf_size; /* size in bytes of @data */
unsigned sectors; /* maximum size for current entry */
@@ -41,9 +43,15 @@ struct journal_buf {
* flushed:
*/
+enum journal_pin_type {
+ JOURNAL_PIN_btree,
+ JOURNAL_PIN_key_cache,
+ JOURNAL_PIN_other,
+ JOURNAL_PIN_NR,
+};
+
struct journal_entry_pin_list {
- struct list_head list;
- struct list_head key_cache_list;
+ struct list_head list[JOURNAL_PIN_NR];
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
@@ -139,52 +147,78 @@ enum journal_space_from {
journal_space_nr,
};
-/*
- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
- * either because something's waiting on the write to complete or because it's
- * been dirty too long and the timer's expired.
- */
-
-enum {
+enum journal_flags {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
- JOURNAL_NEED_WRITE,
- JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
- JOURNAL_NOCHANGES,
+ JOURNAL_NEED_FLUSH_WRITE,
+};
+
+#define JOURNAL_WATERMARKS() \
+ x(any) \
+ x(copygc) \
+ x(reserved)
+
+enum journal_watermark {
+#define x(n) JOURNAL_WATERMARK_##n,
+ JOURNAL_WATERMARKS()
+#undef x
};
+#define JOURNAL_WATERMARK_MASK 3
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS() \
+ x(ok) \
+ x(blocked) \
+ x(max_in_flight) \
+ x(journal_full) \
+ x(journal_pin_full) \
+ x(journal_stuck) \
+ x(insufficient_devices)
+
+enum journal_errors {
+#define x(n) JOURNAL_ERR_##n,
+ JOURNAL_ERRORS()
+#undef x
+};
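/*
 * These x-macro lists (JOURNAL_WATERMARKS(), JOURNAL_ERRORS()) expand into
 * matching enums; for instance JOURNAL_ERRORS() above generates:
 *
 *	enum journal_errors {
 *		JOURNAL_ERR_ok,
 *		JOURNAL_ERR_blocked,
 *		JOURNAL_ERR_max_in_flight,
 *		JOURNAL_ERR_journal_full,
 *		JOURNAL_ERR_journal_pin_full,
 *		JOURNAL_ERR_journal_stuck,
 *		JOURNAL_ERR_insufficient_devices,
 *	};
 *
 * which is what lets journal_reclaim.c assign values like
 * JOURNAL_ERR_journal_full to cur_entry_error.
 */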
+
+typedef DARRAY(u64) darray_u64;
+
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
-
- unsigned long flags;
+ struct {
union journal_res_state reservations;
+ enum journal_watermark watermark;
+
+ union journal_preres_state prereserved;
+
+ } __aligned(SMP_CACHE_BYTES);
+
+ unsigned long flags;
/* Max size of current journal entry */
unsigned cur_entry_u64s;
unsigned cur_entry_sectors;
+ /* Reserved space in journal entry to be used just prior to write */
+ unsigned entry_u64s_reserved;
+
+
/*
	 * JOURNAL_ERR_ok, or the reason the last attempt to get a journal
	 * reservation failed (see JOURNAL_ERRORS above):
*/
- enum {
- cur_entry_ok,
- cur_entry_blocked,
- cur_entry_journal_full,
- cur_entry_journal_pin_full,
- cur_entry_journal_stuck,
- cur_entry_insufficient_devices,
- } cur_entry_error;
-
- union journal_preres_state prereserved;
-
- /* Reserved space in journal entry to be used just prior to write */
- unsigned entry_u64s_reserved;
+ enum journal_errors cur_entry_error;
unsigned buf_size_want;
+ /*
+ * We may queue up some things to be journalled (log messages) before
+ * the journal has actually started - stash them here:
+ */
+ darray_u64 early_journal_entries;
/*
* Two journal entries -- one is currently open for new entries, the
@@ -245,6 +279,10 @@ struct journal {
spinlock_t err_lock;
struct mutex reclaim_lock;
+ /*
+ * Used for waiting until journal reclaim has freed up space in the
+ * journal:
+ */
wait_queue_head_t reclaim_wait;
struct task_struct *reclaim_thread;
bool reclaim_kicked;
@@ -264,21 +302,20 @@ struct journal {
unsigned long last_flush_write;
u64 res_get_blocked_start;
- u64 need_write_time;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
- struct time_stats *flush_write_time;
- struct time_stats *noflush_write_time;
- struct time_stats *blocked_time;
- struct time_stats *flush_seq_time;
+ struct bch2_time_stats *flush_write_time;
+ struct bch2_time_stats *noflush_write_time;
+ struct bch2_time_stats *blocked_time;
+ struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map res_map;
#endif
-};
+} __aligned(SMP_CACHE_BYTES);
/*
* Embedded in struct bch_dev. First three fields refer to the array of journal
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
index cda77835b9ea..cf5998e519e7 100644
--- a/fs/bcachefs/keylist.c
+++ b/fs/bcachefs/keylist.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey.h"
#include "keylist.h"
int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
@@ -30,22 +31,6 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
return 0;
}
-void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
-{
- struct bkey_i *where;
-
- for_each_keylist_key(l, where)
- if (bkey_cmp(insert->k.p, where->k.p) < 0)
- break;
-
- memmove_u64s_up((u64 *) where + insert->k.u64s,
- where,
- ((u64 *) l->top) - ((u64 *) where));
-
- l->top_p += insert->k.u64s;
- bkey_copy(where, insert);
-}
-
void bch2_keylist_pop_front(struct keylist *l)
{
l->top_p -= bch2_keylist_front(l)->k.u64s;
@@ -62,6 +47,6 @@ void bch2_verify_keylist_sorted(struct keylist *l)
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
- bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
+ bpos_ge(k->k.p, bkey_next(k)->k.p));
}
#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
index 195799bb20bc..fe759c7031e0 100644
--- a/fs/bcachefs/keylist.h
+++ b/fs/bcachefs/keylist.h
@@ -5,7 +5,6 @@
#include "keylist_types.h"
int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
-void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
@@ -17,7 +16,6 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
- bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
new file mode 100644
index 000000000000..c2dece27da2d
--- /dev/null
+++ b/fs/bcachefs/lru.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+/* KEY_TYPE_lru is obsolete: */
+int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ if (bkey_val_bytes(k.k) < sizeof(*lru)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*lru));
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ if (!lru_pos_time(k.k->p)) {
+ prt_printf(err, "lru entry at time=0");
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
+{
+ prt_printf(out, "%llu:%llu -> %llu:%llu",
+ lru_pos_id(lru),
+ lru_pos_time(lru),
+ u64_to_bucket(lru.offset).inode,
+ u64_to_bucket(lru.offset).offset);
+}
+
+static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
+ u64 dev_bucket, u64 time, unsigned key_type)
+{
+ struct bkey_i *k;
+ int ret = 0;
+
+ if (!time)
+ return 0;
+
+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = key_type;
+ k->k.p = lru_pos(lru_id, dev_bucket, time);
+
+ EBUG_ON(lru_pos_id(k->k.p) != lru_id);
+ EBUG_ON(lru_pos_time(k->k.p) != time);
+ EBUG_ON(k->k.p.offset != dev_bucket);
+
+ return bch2_trans_update_buffered(trans, BTREE_ID_lru, k);
+}
+
+int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
+}
+
+int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
+}
+
+int bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
+{
+ if (old_time == new_time)
+ return 0;
+
+ return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
+ bch2_lru_set(trans, lru_id, dev_bucket, new_time);
+}
+
+static const char * const bch2_lru_types[] = {
+#define x(n) #n,
+ BCH_LRU_TYPES()
+#undef x
+ NULL
+};
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ struct bpos *last_flushed_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ enum bch_lru_type type = lru_type(lru_k);
+ struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
+ u64 idx;
+ int ret;
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ alloc_pos.inode, alloc_pos.offset))
+ return bch2_btree_delete_at(trans, lru_iter, 0);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ switch (type) {
+ case BCH_LRU_read:
+ idx = alloc_lru_idx_read(*a);
+ break;
+ case BCH_LRU_fragmentation:
+ idx = a->fragmentation_lru;
+ break;
+ }
+
+ if (lru_k.k->type != KEY_TYPE_set ||
+ lru_pos_time(lru_k.k->p) != idx) {
+ if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
+ *last_flushed_pos = lru_k.k->p;
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+
+ if (c->opts.reconstruct_alloc ||
+ fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
+ " %s\n"
+ " for %s",
+ bch2_lru_types[type],
+ lru_pos_time(lru_k.k->p),
+ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
+ ret = bch2_btree_delete_at(trans, lru_iter, 0);
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos last_flushed_pos = POS_MIN;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos));
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
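The bch2_lru_types[] name table above and the bch_lru_type enum in lru.h are both generated from the same BCH_LRU_TYPES() x-macro, so adding a new LRU type to that list updates the enum and the name table together. A minimal standalone sketch of the pattern (illustrative only, not part of the patch):

    #include <stdio.h>

    #define BCH_LRU_TYPES()	\
    	x(read)		\
    	x(fragmentation)

    enum bch_lru_type {
    #define x(n)	BCH_LRU_##n,
    	BCH_LRU_TYPES()
    #undef x
    };

    static const char * const bch2_lru_types[] = {
    #define x(n)	#n,
    	BCH_LRU_TYPES()
    #undef x
    	NULL
    };

    int main(void)
    {
    	printf("%s\n", bch2_lru_types[BCH_LRU_fragmentation]);	/* prints "fragmentation" */
    	return 0;
    }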
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
new file mode 100644
index 000000000000..78a6076999ed
--- /dev/null
+++ b/fs/bcachefs/lru.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
+static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
+{
+ EBUG_ON(time > LRU_TIME_MAX);
+
+ return POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
+}
+
+static inline u64 lru_pos_id(struct bpos pos)
+{
+ return pos.inode >> LRU_TIME_BITS;
+}
+
+static inline u64 lru_pos_time(struct bpos pos)
+{
+ return pos.inode & ~(~0ULL << LRU_TIME_BITS);
+}
+
+#define BCH_LRU_TYPES() \
+ x(read) \
+ x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+ BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+
+static inline enum bch_lru_type lru_type(struct bkey_s_c l)
+{
+ u16 lru_id = l.k->p.inode >> 48;
+
+ if (lru_id == BCH_LRU_FRAGMENTATION_START)
+ return BCH_LRU_fragmentation;
+ return BCH_LRU_read;
+}
+
+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
+
+#define bch2_bkey_ops_lru ((struct bkey_ops) { \
+ .key_invalid = bch2_lru_invalid, \
+ .val_to_text = bch2_lru_to_text, \
+})
+
+int bch2_lru_del(struct btree_trans *, u16, u64, u64);
+int bch2_lru_set(struct btree_trans *, u16, u64, u64);
+int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+int bch2_check_lrus(struct bch_fs *);
+
+#endif /* _BCACHEFS_LRU_H */
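The helpers above encode the LRU ordering directly into a btree position: the upper 16 bits of .inode hold the LRU id, the low LRU_TIME_BITS (48) bits hold the time/index, and .offset holds the encoded device:bucket. A minimal round-trip sketch of that packing (illustrative only; plain u64s stand in for struct bpos):

    #include <assert.h>
    #include <stdint.h>

    #define LRU_TIME_BITS	48
    #define LRU_TIME_MAX	((1ULL << LRU_TIME_BITS) - 1)

    int main(void)
    {
    	uint64_t lru_id = 7, time = 123456;

    	assert(time <= LRU_TIME_MAX);

    	/* lru_pos(): pack the id and time into the inode field */
    	uint64_t inode = ((uint64_t) lru_id << LRU_TIME_BITS) | time;

    	/* lru_pos_id() / lru_pos_time(): recover both halves */
    	assert((inode >> LRU_TIME_BITS) == lru_id);
    	assert((inode & ~(~0ULL << LRU_TIME_BITS)) == time);

    	return 0;
    }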
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 6defc33322b3..d93db07f0c87 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -8,6 +8,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
+#include "errcode.h"
#include "extents.h"
#include "io.h"
#include "journal.h"
@@ -35,85 +36,74 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
return 0;
}
-static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
- enum btree_id btree_id)
+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ int flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ if (!bch2_bkey_has_device_c(k, dev_idx))
+ return 0;
+
+ n = bch2_bkey_make_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
+ if (ret)
+ return ret;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k))
+ n->k.size = 0;
+
+ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_buf sk;
+ enum btree_id id;
int ret = 0;
- bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((bch2_trans_begin(&trans),
- (k = bch2_btree_iter_peek(&iter)).k) &&
- !(ret = bkey_err(k))) {
- if (!bch2_bkey_has_device(k, dev_idx)) {
- bch2_btree_iter_advance(&iter);
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_ptrs(id))
continue;
- }
-
- bch2_bkey_buf_reassemble(&sk, c, k);
- ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
- dev_idx, flags, false);
- if (ret)
- break;
-
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, bkey_i_to_s(sk.k));
-
- /*
- * Since we're not inserting through an extent iterator
- * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
- * we aren't using the extent overwrite path to delete, we're
- * just using the normal key deletion path:
- */
- if (bkey_deleted(&sk.k->k))
- sk.k->k.size = 0;
-
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, sk.k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
-
- /*
- * don't want to leave ret == -EINTR, since if we raced and
- * something else overwrote the key we could spuriously return
- * -EINTR below:
- */
- if (ret == -EINTR)
- ret = 0;
+ ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags));
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
-
- BUG_ON(ret == -EINTR);
return ret;
}
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
-{
- return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?:
- __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink);
-}
-
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
@@ -140,8 +130,7 @@ retry:
while (bch2_trans_begin(&trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
- if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
- dev_idx))
+ if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
bch2_bkey_buf_copy(&k, c, &b->key);
@@ -154,19 +143,20 @@ retry:
}
ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
- if (ret == -EINTR) {
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
if (ret) {
- bch_err(c, "Error updating btree node key: %i", ret);
+ bch_err(c, "Error updating btree node key: %s",
+ bch2_err_str(ret));
break;
}
next:
bch2_btree_iter_next_node(&iter);
}
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter);
@@ -175,16 +165,13 @@ next:
goto err;
}
- /* flush relevant btree updates */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
-
+ bch2_btree_interior_updates_flush(c);
ret = 0;
err:
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&k, c);
- BUG_ON(ret == -EINTR);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
return ret;
}
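These migrate.c hunks replace raw -EINTR checks with bch2_err_matches(ret, BCH_ERR_transaction_restart), so any error in the transaction-restart class is retried rather than compared against a single errno. The retry shape itself is unchanged; as a hedged sketch (do_one_update() is a hypothetical stand-in for the transactional work in the loop body):

    	int ret;
    retry:
    	bch2_trans_begin(&trans);

    	ret = do_one_update(&trans);	/* hypothetical transactional work */

    	/* a restart-class error means locks were dropped: just retry */
    	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
    		goto retry;

    	if (ret)
    		bch_err(c, "update failed: %s", bch2_err_str(ret));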
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 9e6db2917a19..d7bcdc88657a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -1,20 +1,23 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
-#include "buckets.h"
+#include "btree_write_buffer.h"
#include "disk_groups.h"
#include "ec.h"
+#include "errcode.h"
+#include "error.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
-#include "subvolume.h"
#include "super-io.h"
#include "keylist.h"
@@ -23,507 +26,241 @@
#include <trace/events/bcachefs.h>
-#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
+static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_add(&stats->list, &c->data_progress_list);
+ mutex_unlock(&c->data_progress_lock);
+}
+
+static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_del(&stats->list);
+ mutex_unlock(&c->data_progress_lock);
+}
struct moving_io {
- struct list_head list;
- struct closure cl;
- bool read_completed;
+ struct list_head read_list;
+ struct list_head io_list;
+ struct move_bucket_in_flight *b;
+ struct closure cl;
+ bool read_completed;
- unsigned read_sectors;
- unsigned write_sectors;
+ unsigned read_sectors;
+ unsigned write_sectors;
- struct bch_read_bio rbio;
+ struct bch_read_bio rbio;
- struct migrate_write write;
+ struct data_update write;
/* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[0];
+ struct bio_vec bi_inline_vecs[0];
};
-struct moving_context {
- /* Closure for waiting on all reads and writes to complete */
- struct closure cl;
-
- struct bch_move_stats *stats;
-
- struct list_head reads;
-
- /* in flight sectors: */
- atomic_t read_sectors;
- atomic_t write_sectors;
-
- wait_queue_head_t wait;
-};
-
-static int insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
+static void move_free(struct moving_io *io)
{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, update_iter;
- struct bkey_s_c k;
- struct snapshots_seen s;
- int ret;
-
- if (!btree_type_has_snapshots(id))
- return 0;
-
- snapshots_seen_init(&s);
-
- if (!bkey_cmp(old_pos, new_pos))
- return 0;
-
- if (!snapshot_t(c, old_pos.snapshot)->children[0])
- return 0;
-
- bch2_trans_iter_init(trans, &iter, id, old_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
- while (1) {
-next:
- k = bch2_btree_iter_prev(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- if (bkey_cmp(old_pos, k.k->p))
- break;
-
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
- struct bkey_i *update;
- size_t i;
-
- for (i = 0; i < s.nr; i++)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
- goto next;
-
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ struct moving_context *ctxt = io->write.ctxt;
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- break;
+ if (io->b)
+ atomic_dec(&io->b->count);
- bkey_init(&update->k);
- update->k.p = new_pos;
- update->k.p.snapshot = k.k->p.snapshot;
-
- bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&update_iter) ?:
- bch2_trans_update(trans, &update_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- bch2_trans_iter_exit(trans, &update_iter);
- if (ret)
- break;
+ bch2_data_update_exit(&io->write);
- ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
- kfree(s.d);
+ mutex_lock(&ctxt->lock);
+ list_del(&io->io_list);
+ wake_up(&ctxt->wait);
+ mutex_unlock(&ctxt->lock);
- return ret;
+ kfree(io);
}
-static int bch2_migrate_index_update(struct bch_write_op *op)
+static void move_write_done(struct bch_write_op *op)
{
- struct bch_fs *c = op->c;
- struct btree_trans trans;
- struct btree_iter iter;
- struct migrate_write *m =
- container_of(op, struct migrate_write, op);
- struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
- struct keylist *keys = &op->insert_keys;
- struct bkey_buf _new, _insert;
- int ret = 0;
-
- bch2_bkey_buf_init(&_new);
- bch2_bkey_buf_init(&_insert);
- bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
- bch2_trans_iter_init(&trans, &iter, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- while (1) {
- struct bkey_s_c k;
- struct bkey_i *insert;
- struct bkey_i_extent *new;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bpos next_pos;
- bool did_work = false;
- bool should_check_enospc;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-
- bch2_trans_begin(&trans);
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- new = bkey_i_to_extent(bch2_keylist_front(keys));
-
- if (bversion_cmp(k.k->version, new->k.version) ||
- !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
- goto nomatch;
-
- bkey_reassemble(_insert.k, k);
- insert = _insert.k;
-
- bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
- new = bkey_i_to_extent(_new.k);
- bch2_cut_front(iter.pos, &new->k_i);
-
- bch2_cut_front(iter.pos, insert);
- bch2_cut_back(new->k.p, insert);
- bch2_cut_back(insert->k.p, &new->k_i);
-
- if (m->data_cmd == DATA_REWRITE) {
- struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
- bch2_bkey_has_device(bkey_i_to_s_c(insert),
- m->data_opts.rewrite_dev);
- if (!old_ptr)
- goto nomatch;
-
- if (old_ptr->cached)
- extent_for_each_ptr(extent_i_to_s(new), new_ptr)
- new_ptr->cached = true;
-
- __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
- }
-
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
- if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
- /*
- * raced with another move op? extent already
- * has a pointer to the device we just wrote
- * data to
- */
- continue;
- }
-
- bch2_extent_ptr_decoded_append(insert, &p);
- did_work = true;
- }
+ struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct moving_context *ctxt = io->write.ctxt;
- if (!did_work)
- goto nomatch;
+ if (io->write.op.error)
+ ctxt->write_error = true;
- bch2_bkey_narrow_crcs(insert,
- (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, bkey_i_to_s(insert));
- bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
- op->opts.background_target,
- op->opts.data_replicas);
+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
+ atomic_dec(&io->write.ctxt->write_ios);
+ move_free(io);
+ closure_put(&ctxt->cl);
+}
- ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
- &should_check_enospc,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- goto err;
+static void move_write(struct moving_io *io)
+{
+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+ move_free(io);
+ return;
+ }
- if (disk_sectors_delta > (s64) op->res.sectors) {
- ret = bch2_disk_reservation_add(c, &op->res,
- disk_sectors_delta - op->res.sectors,
- !should_check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto out;
- }
+ closure_get(&io->write.ctxt->cl);
+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+ atomic_inc(&io->write.ctxt->write_ios);
- next_pos = insert->k.p;
-
- ret = insert_snapshot_whiteouts(&trans, m->btree_id,
- k.k->p, insert->k.p) ?:
- bch2_trans_update(&trans, &iter, insert,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(&trans, &op->res,
- op_journal_seq(op),
- BTREE_INSERT_NOFAIL|
- m->data_opts.btree_insert_flags);
- if (!ret) {
- bch2_btree_iter_set_pos(&iter, next_pos);
- atomic_long_inc(&c->extent_migrate_done);
- if (ec_ob)
- bch2_ob_add_backpointer(c, ec_ob, &insert->k);
- }
-err:
- if (ret == -EINTR)
- ret = 0;
- if (ret)
- break;
-next:
- while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
- bch2_keylist_pop_front(keys);
- if (bch2_keylist_empty(keys))
- goto out;
- }
- continue;
-nomatch:
- if (m->ctxt) {
- BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->ctxt->stats->keys_raced);
- atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
- }
- atomic_long_inc(&c->extent_migrate_raced);
- trace_move_race(&new->k);
- bch2_btree_iter_advance(&iter);
- goto next;
- }
-out:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&_insert, c);
- bch2_bkey_buf_exit(&_new, c);
- BUG_ON(ret == -EINTR);
- return ret;
+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}
-void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
- /* write bio must own pages: */
- BUG_ON(!m->op.wbio.bio.bi_vcnt);
-
- m->ptr = rbio->pick.ptr;
- m->offset = rbio->data_pos.offset - rbio->pick.crc.offset;
- m->op.devs_have = rbio->devs_have;
- m->op.pos = rbio->data_pos;
- m->op.version = rbio->version;
- m->op.crc = rbio->pick.crc;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
-
- if (m->data_cmd == DATA_REWRITE)
- bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
+ struct moving_io *io =
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
+
+ return io && io->read_completed ? io : NULL;
}
-int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
- struct write_point_specifier wp,
- struct bch_io_opts io_opts,
- enum data_cmd data_cmd,
- struct data_opts data_opts,
- enum btree_id btree_id,
- struct bkey_s_c k)
+static void move_read_endio(struct bio *bio)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- struct extent_ptr_decoded p;
- int ret;
-
- m->btree_id = btree_id;
- m->data_cmd = data_cmd;
- m->data_opts = data_opts;
- m->nr_ptrs_reserved = 0;
-
- bch2_write_op_init(&m->op, c, io_opts);
-
- if (!bch2_bkey_is_incompressible(k))
- m->op.compression_type =
- bch2_compression_opt_to_type[io_opts.background_compression ?:
- io_opts.compression];
- else
- m->op.incompressible = true;
-
- m->op.target = data_opts.target,
- m->op.write_point = wp;
-
- /*
- * op->csum_type is normally initialized from the fs/file's current
- * options - but if an extent is encrypted, we require that it stays
- * encrypted:
- */
- bkey_for_each_crc(k.k, ptrs, crc, entry)
- if (bch2_csum_type_is_encryption(crc.csum_type)) {
- m->op.nonce = crc.nonce + crc.offset;
- m->op.csum_type = crc.csum_type;
- break;
- }
-
- if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
- m->op.alloc_reserve = RESERVE_MOVINGGC;
- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- } else {
- /* XXX: this should probably be passed in */
- m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
- }
+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+ struct moving_context *ctxt = io->write.ctxt;
- m->op.flags |= BCH_WRITE_PAGES_STABLE|
- BCH_WRITE_PAGES_OWNED|
- BCH_WRITE_DATA_ENCODED|
- BCH_WRITE_FROM_INTERNAL;
+ atomic_sub(io->read_sectors, &ctxt->read_sectors);
+ atomic_dec(&ctxt->read_ios);
+ io->read_completed = true;
- m->op.nr_replicas = data_opts.nr_replicas;
- m->op.nr_replicas_required = data_opts.nr_replicas;
- m->op.index_update_fn = bch2_migrate_index_update;
+ wake_up(&ctxt->wait);
+ closure_put(&ctxt->cl);
+}
- switch (data_cmd) {
- case DATA_ADD_REPLICAS: {
- /*
- * DATA_ADD_REPLICAS is used for moving data to a different
- * device in the background, and due to compression the new copy
- * might take up more space than the old copy:
- */
-#if 0
- int nr = (int) io_opts.data_replicas -
- bch2_bkey_nr_ptrs_allocated(k);
-#endif
- int nr = (int) io_opts.data_replicas;
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
+ struct btree_trans *trans)
+{
+ struct moving_io *io;
- if (nr > 0) {
- m->op.nr_replicas = m->nr_ptrs_reserved = nr;
+ if (trans)
+ bch2_trans_unlock(trans);
- ret = bch2_disk_reservation_get(c, &m->op.res,
- k.k->size, m->op.nr_replicas, 0);
- if (ret)
- return ret;
- }
- break;
+ while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+ list_del(&io->read_list);
+ move_write(io);
}
- case DATA_REWRITE: {
- unsigned compressed_sectors = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == data_opts.rewrite_dev) {
- if (p.ptr.cached)
- m->op.flags |= BCH_WRITE_CACHED;
-
- if (!p.ptr.cached &&
- crc_is_compressed(p.crc))
- compressed_sectors += p.crc.compressed_size;
- }
+}
- if (compressed_sectors) {
- ret = bch2_disk_reservation_add(c, &m->op.res,
- k.k->size * m->op.nr_replicas,
- BCH_DISK_RESERVATION_NOFAIL);
- if (ret)
- return ret;
- }
- break;
- }
- case DATA_PROMOTE:
- m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- m->op.flags |= BCH_WRITE_CACHED;
- break;
- default:
- BUG();
- }
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
+ struct btree_trans *trans)
+{
+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
- return 0;
+ move_ctxt_wait_event(ctxt, trans,
+ !atomic_read(&ctxt->write_sectors) ||
+ atomic_read(&ctxt->write_sectors) != sectors_pending);
}
-static void move_free(struct closure *cl)
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
- struct moving_context *ctxt = io->write.ctxt;
- struct bio_vec *bv;
- unsigned i;
+ struct bch_fs *c = ctxt->c;
- bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
+ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+ closure_sync(&ctxt->cl);
- bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
- if (bv->bv_page)
- __free_page(bv->bv_page);
+ EBUG_ON(atomic_read(&ctxt->write_sectors));
+ EBUG_ON(atomic_read(&ctxt->write_ios));
+ EBUG_ON(atomic_read(&ctxt->read_sectors));
+ EBUG_ON(atomic_read(&ctxt->read_ios));
- wake_up(&ctxt->wait);
+ if (ctxt->stats) {
+ progress_list_del(c, ctxt->stats);
+ trace_move_data(c,
+ atomic64_read(&ctxt->stats->sectors_moved),
+ atomic64_read(&ctxt->stats->keys_moved));
+ }
- kfree(io);
+ mutex_lock(&c->moving_context_lock);
+ list_del(&ctxt->list);
+ mutex_unlock(&c->moving_context_lock);
}
-static void move_write_done(struct closure *cl)
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+ struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ memset(ctxt, 0, sizeof(*ctxt));
- atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
- closure_return_with_destructor(cl, move_free);
-}
+ ctxt->c = c;
+ ctxt->fn = (void *) _RET_IP_;
+ ctxt->rate = rate;
+ ctxt->stats = stats;
+ ctxt->wp = wp;
+ ctxt->wait_on_copygc = wait_on_copygc;
-static void move_write(struct closure *cl)
-{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ closure_init_stack(&ctxt->cl);
- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
- closure_return_with_destructor(cl, move_free);
- return;
- }
+ mutex_init(&ctxt->lock);
+ INIT_LIST_HEAD(&ctxt->reads);
+ INIT_LIST_HEAD(&ctxt->ios);
+ init_waitqueue_head(&ctxt->wait);
- bch2_migrate_read_done(&io->write, &io->rbio);
+ mutex_lock(&c->moving_context_lock);
+ list_add(&ctxt->list, &c->moving_context_list);
+ mutex_unlock(&c->moving_context_lock);
- atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- closure_call(&io->write.op.cl, bch2_write, NULL, cl);
- continue_at(cl, move_write_done, NULL);
+ if (stats) {
+ progress_list_add(c, stats);
+ stats->data_type = BCH_DATA_user;
+ }
}
-static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
- struct moving_io *io =
- list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
-
- return io && io->read_completed ? io : NULL;
+ memset(stats, 0, sizeof(*stats));
+ scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
-static void move_read_endio(struct bio *bio)
+static int bch2_extent_drop_ptrs(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct data_update_opts data_opts)
{
- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
- struct moving_context *ctxt = io->write.ctxt;
-
- atomic_sub(io->read_sectors, &ctxt->read_sectors);
- io->read_completed = true;
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
- if (next_pending_write(ctxt))
- wake_up(&ctxt->wait);
+ n = bch2_bkey_make_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
- closure_put(&ctxt->cl);
-}
+ while (data_opts.kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+ struct bch_extent_ptr *ptr;
-static void do_pending_writes(struct moving_context *ctxt)
-{
- struct moving_io *io;
-
- while ((io = next_pending_write(ctxt))) {
- list_del(&io->list);
- closure_call(&io->cl, move_write, NULL, &ctxt->cl);
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ data_opts.kill_ptrs ^= 1U << drop;
}
-}
-#define move_ctxt_wait_event(_ctxt, _cond) \
-do { \
- do_pending_writes(_ctxt); \
- \
- if (_cond) \
- break; \
- __wait_event((_ctxt)->wait, \
- next_pending_write(_ctxt) || (_cond)); \
-} while (1)
-
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
-{
- unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
- move_ctxt_wait_event(ctxt,
- !atomic_read(&ctxt->write_sectors) ||
- atomic_read(&ctxt->write_sectors) != sectors_pending);
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k))
+ n->k.size = 0;
+
+ return bch2_trans_relock(trans) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
static int bch2_move_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
struct moving_context *ctxt,
- struct write_point_specifier wp,
+ struct move_bucket_in_flight *bucket_in_flight,
struct bch_io_opts io_opts,
enum btree_id btree_id,
struct bkey_s_c k,
- enum data_cmd data_cmd,
- struct data_opts data_opts)
+ struct data_update_opts data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -533,13 +270,20 @@ static int bch2_move_extent(struct btree_trans *trans,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->write_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
+ bch2_data_update_opts_normalize(k, &data_opts);
+
+ if (!data_opts.rewrite_ptrs &&
+ !data_opts.extra_replicas) {
+ if (data_opts.kill_ptrs)
+ return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ return 0;
+ }
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->read_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
+ /*
+ * Before memory allocations & taking nocow locks in
+ * bch2_data_update_init():
+ */
+ bch2_trans_unlock(trans);
/* write path might have to decompress data: */
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
@@ -551,11 +295,12 @@ static int bch2_move_extent(struct btree_trans *trans,
if (!io)
goto err;
+ INIT_LIST_HEAD(&io->io_list);
io->write.ctxt = ctxt;
io->read_sectors = k.k->size;
io->write_sectors = k.k->size;
- bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
+ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
bio_set_prio(&io->write.op.wbio.bio,
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
@@ -565,27 +310,53 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.c = c;
io->rbio.opts = io_opts;
- bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
+ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
- bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
+ io->rbio.bio.bi_opf = REQ_OP_READ;
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
- ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
- data_cmd, data_opts, btree_id, k);
- if (ret)
+ ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
+ io_opts, data_opts, btree_id, k);
+ if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
- atomic64_inc(&ctxt->stats->keys_moved);
- atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+ if (ret == -BCH_ERR_unwritten_extent_update) {
+ bch2_update_unwritten_extent(trans, &io->write);
+ move_free(io);
+ return 0;
+ }
+
+ BUG_ON(ret);
+
+ io->write.ctxt = ctxt;
+ io->write.op.end_io = move_write_done;
+
+ if (ctxt->stats) {
+ atomic64_inc(&ctxt->stats->keys_moved);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+ }
+
+ if (bucket_in_flight) {
+ io->b = bucket_in_flight;
+ atomic_inc(&io->b->count);
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+ trace_move_extent_read(k.k);
- trace_move_extent(k.k);
+ mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
- list_add_tail(&io->list, &ctxt->reads);
+ atomic_inc(&ctxt->read_ios);
+
+ list_add_tail(&io->read_list, &ctxt->reads);
+ list_add_tail(&io->io_list, &ctxt->ios);
+ mutex_unlock(&ctxt->lock);
/*
* dropped by move_read_endio() - guards against use after free of
@@ -603,7 +374,7 @@ err_free_pages:
err_free:
kfree(io);
err:
- trace_move_alloc_fail(k.k);
+ trace_and_count(c, move_extent_alloc_mem_fail, k.k);
return ret;
}
@@ -621,7 +392,7 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
if (ret)
goto err;
- if (!k.k || bkey_cmp(k.k->p, pos)) {
+ if (!k.k || !bkey_eq(k.k->p, pos)) {
ret = -ENOENT;
goto err;
}
@@ -638,65 +409,111 @@ err:
return ret;
}
-static int __bch2_move_data(struct bch_fs *c,
- struct moving_context *ctxt,
- struct bch_ratelimit *rate,
- struct write_point_specifier wp,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- struct bch_move_stats *stats,
- enum btree_id btree_id)
+static int move_ratelimit(struct btree_trans *trans,
+ struct moving_context *ctxt)
{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct bch_fs *c = trans->c;
+ u64 delay;
+
+ if (ctxt->wait_on_copygc) {
+ bch2_trans_unlock(trans);
+ wait_event_killable(c->copygc_running_wq,
+ !c->copygc_running ||
+ kthread_should_stop());
+ }
+
+ do {
+ delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
+ if (delay) {
+ bch2_trans_unlock(trans);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 1;
+ }
+
+ if (delay)
+ schedule_timeout(delay);
+
+ if (unlikely(freezing(current))) {
+ move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+ try_to_freeze();
+ }
+ } while (delay);
+
+ /*
+ * XXX: these limits really ought to be per device, SSDs and hard drives
+ * will want different limits
+ */
+ move_ctxt_wait_event(ctxt, trans,
+ atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
+ atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
+
+ return 0;
+}
+
+static int move_get_io_opts(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c k, u64 *cur_inum)
+{
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ if (*cur_inum == k.k->p.inode)
+ return 0;
+
+ ret = lookup_inode(trans,
+ SPOS(0, k.k->p.inode, k.k->p.snapshot),
+ &inode);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ if (!ret)
+ bch2_inode_opts_get(io_opts, trans->c, &inode);
+ else
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ *cur_inum = k.k->p.inode;
+ return 0;
+}
+
+static int __bch2_move_data(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
+{
+ struct bch_fs *c = ctxt->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct bkey_buf sk;
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct data_opts data_opts;
- enum data_cmd data_cmd;
- u64 delay, cur_inum = U64_MAX;
+ struct data_update_opts data_opts;
+ u64 cur_inum = U64_MAX;
int ret = 0, ret2;
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- stats->data_type = BCH_DATA_user;
- stats->btree_id = btree_id;
- stats->pos = start;
+ if (ctxt->stats) {
+ ctxt->stats->data_type = BCH_DATA_user;
+ ctxt->stats->btree_id = btree_id;
+ ctxt->stats->pos = start;
+ }
bch2_trans_iter_init(&trans, &iter, btree_id, start,
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
- if (rate)
- bch2_ratelimit_reset(rate);
-
- while (1) {
- do {
- delay = rate ? bch2_ratelimit_delay(rate) : 0;
-
- if (delay) {
- bch2_trans_unlock(&trans);
- set_current_state(TASK_INTERRUPTIBLE);
- }
-
- if (kthread && (ret = kthread_should_stop())) {
- __set_current_state(TASK_RUNNING);
- goto out;
- }
-
- if (delay)
- schedule_timeout(delay);
-
- if (unlikely(freezing(current))) {
- bch2_trans_unlock(&trans);
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
- try_to_freeze();
- }
- } while (delay);
+ if (ctxt->rate)
+ bch2_ratelimit_reset(ctxt->rate);
+ while (!move_ratelimit(&trans, ctxt)) {
bch2_trans_begin(&trans);
k = bch2_btree_iter_peek(&iter);
@@ -704,64 +521,45 @@ static int __bch2_move_data(struct bch_fs *c,
break;
ret = bkey_err(k);
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+ if (bkey_ge(bkey_start_pos(k.k), end))
break;
- stats->pos = iter.pos;
+ if (ctxt->stats)
+ ctxt->stats->pos = iter.pos;
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- if (btree_id == BTREE_ID_extents &&
- cur_inum != k.k->p.inode) {
- struct bch_inode_unpacked inode;
-
- io_opts = bch2_opts_to_inode_opts(c->opts);
-
- ret = lookup_inode(&trans,
- SPOS(0, k.k->p.inode, k.k->p.snapshot),
- &inode);
- if (ret == -EINTR)
- continue;
-
- if (!ret)
- bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
-
- cur_inum = k.k->p.inode;
- }
+ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+ if (ret)
+ continue;
- switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
- case DATA_SKIP:
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, k, &io_opts, &data_opts))
goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
- case DATA_PROMOTE:
- break;
- default:
- BUG();
- }
- /* unlock before doing IO: */
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
- ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
- data_cmd, data_opts);
+ ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL,
+ io_opts, btree_id, k, data_opts);
if (ret2) {
- if (ret2 == -EINTR)
+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
+ bch2_move_ctxt_wait_for_io(ctxt, &trans);
continue;
}
@@ -769,14 +567,14 @@ static int __bch2_move_data(struct bch_fs *c,
goto next;
}
- if (rate)
- bch2_ratelimit_increment(rate, k.k->size);
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
- atomic64_add(k.k->size, &stats->sectors_seen);
+ if (ctxt->stats)
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
bch2_btree_iter_advance(&iter);
}
-out:
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
@@ -785,40 +583,20 @@ out:
return ret;
}
-static inline void progress_list_add(struct bch_fs *c,
- struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_add(&stats->list, &c->data_progress_list);
- mutex_unlock(&c->data_progress_lock);
-}
-
-static inline void progress_list_del(struct bch_fs *c,
- struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_del(&stats->list);
- mutex_unlock(&c->data_progress_lock);
-}
-
int bch2_move_data(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
struct write_point_specifier wp,
- move_pred_fn pred, void *arg,
- struct bch_move_stats *stats)
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
{
- struct moving_context ctxt = { .stats = stats };
+ struct moving_context ctxt;
enum btree_id id;
int ret;
- progress_list_add(c, stats);
- closure_init_stack(&ctxt.cl);
- INIT_LIST_HEAD(&ctxt.reads);
- init_waitqueue_head(&ctxt.wait);
-
- stats->data_type = BCH_DATA_user;
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
@@ -829,31 +607,282 @@ int bch2_move_data(struct bch_fs *c,
id != BTREE_ID_reflink)
continue;
- ret = __bch2_move_data(c, &ctxt, rate, wp,
+ ret = __bch2_move_data(&ctxt,
id == start_btree_id ? start_pos : POS_MIN,
id == end_btree_id ? end_pos : POS_MAX,
- pred, arg, stats, id);
+ pred, arg, id);
if (ret)
break;
}
+ bch2_moving_ctxt_exit(&ctxt);
- move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
- closure_sync(&ctxt.cl);
+ return ret;
+}
- EBUG_ON(atomic_read(&ctxt.write_sectors));
+void bch2_verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ struct bch_backpointer bp;
+ struct bpos bp_pos = POS_MIN;
+ unsigned nr_bps = 0;
+ int ret;
- trace_move_data(c,
- atomic64_read(&stats->sectors_moved),
- atomic64_read(&stats->keys_moved));
+ bch2_trans_begin(trans);
- progress_list_del(c, stats);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED);
+again:
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+
+ if (!ret && k.k->type == KEY_TYPE_alloc_v4) {
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+
+ if (a.v->gen == gen &&
+ a.v->dirty_sectors) {
+ if (a.v->data_type == BCH_DATA_btree) {
+ bch2_trans_unlock(trans);
+ if (bch2_btree_interior_updates_flush(c))
+ goto again;
+ goto failed_to_evacuate;
+ }
+ }
+ }
+
+ set_btree_iter_dontneed(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ return;
+failed_to_evacuate:
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ return;
+
+ prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket "));
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ while (1) {
+ bch2_trans_begin(trans);
+
+ ret = bch2_get_next_backpointer(trans, bucket, gen,
+ &bp_pos, &bp,
+ BTREE_ITER_CACHED);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ if (bkey_eq(bp_pos, POS_MAX))
+ break;
+
+ k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ if (!k.k)
+ continue;
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (++nr_bps > 10)
+ break;
+ bp_pos = bpos_nosnap_successor(bp_pos);
+ }
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+int __bch2_evacuate_bucket(struct btree_trans *trans,
+ struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct bpos bucket, int gen,
+ struct data_update_opts _data_opts)
+{
+ struct bch_fs *c = ctxt->c;
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bch_backpointer bp;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct bkey_s_c k;
+ struct data_update_opts data_opts;
+ unsigned dirty_sectors, bucket_size;
+ u64 fragmentation;
+ u64 cur_inum = U64_MAX;
+ struct bpos bp_pos = POS_MIN;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&sk);
+
+ /*
+ * We're not run in a context that handles transaction restarts:
+ */
+ bch2_trans_begin(trans);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED);
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret) {
+ bch_err(c, "%s: error looking up alloc key: %s", __func__, bch2_err_str(ret));
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+ dirty_sectors = a->dirty_sectors;
+ bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+ fragmentation = a->fragmentation_lru;
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (ret) {
+ bch_err(c, "%s: error flushing btree write buffer: %s", __func__, bch2_err_str(ret));
+ goto err;
+ }
+
+ while (!(ret = move_ratelimit(trans, ctxt))) {
+ bch2_trans_begin(trans);
+
+ ret = bch2_get_next_backpointer(trans, bucket, gen,
+ &bp_pos, &bp,
+ BTREE_ITER_CACHED);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (bkey_eq(bp_pos, POS_MAX))
+ break;
+
+ if (!bp.level) {
+ const struct bch_extent_ptr *ptr;
+ struct bkey_s_c k;
+ unsigned i = 0;
+
+ k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ goto next;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+ if (ret) {
+ bch2_trans_iter_exit(trans, &iter);
+ continue;
+ }
+
+ data_opts = _data_opts;
+ data_opts.target = io_opts.background_target;
+ data_opts.rewrite_ptrs = 0;
+
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (ptr->dev == bucket.inode) {
+ data_opts.rewrite_ptrs |= 1U << i;
+ if (ptr->cached) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }
+ }
+ i++;
+ }
+
+ ret = bch2_move_extent(trans, &iter, ctxt,
+ bucket_in_flight,
+ io_opts, bp.btree_id, k, data_opts);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt, trans);
+ continue;
+ }
+ if (ret)
+ goto err;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+ if (ctxt->stats)
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+ } else {
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!b)
+ goto next;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate,
+ c->opts.btree_node_size >> 9);
+ if (ctxt->stats) {
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ }
+ }
+next:
+ bp_pos = bpos_nosnap_successor(bp_pos);
+ }
+
+ trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
+err:
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
-typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
- struct btree *, struct bch_io_opts *,
- struct data_opts *);
+int bch2_evacuate_bucket(struct bch_fs *c,
+ struct bpos bucket, int gen,
+ struct data_update_opts data_opts,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
+{
+ struct btree_trans trans;
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
@@ -867,8 +896,7 @@ static int bch2_move_btree(struct bch_fs *c,
struct btree_iter iter;
struct btree *b;
enum btree_id id;
- struct data_opts data_opts;
- enum data_cmd cmd;
+ struct data_update_opts data_opts;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
@@ -897,27 +925,18 @@ retry:
stats->pos = iter.pos;
- switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
- case DATA_SKIP:
+ if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
- case DATA_SCRUB:
- BUG();
- case DATA_ADD_REPLICAS:
- case DATA_REWRITE:
- break;
- default:
- BUG();
- }
ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
next:
bch2_btree_iter_next_node(&iter);
}
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(&trans, &iter);
@@ -929,30 +948,18 @@ next:
bch2_trans_exit(&trans);
if (ret)
- bch_err(c, "error %i in bch2_move_btree", ret);
+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret));
- /* flush relevant btree updates */
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ bch2_btree_interior_updates_flush(c);
progress_list_del(c, stats);
return ret;
}
-#if 0
-static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- return DATA_SCRUB;
-}
-#endif
-
-static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
unsigned nr_good = bch2_bkey_durability(c, k);
unsigned replicas = bkey_is_btree_ptr(k.k)
@@ -960,43 +967,50 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
: io_opts->data_replicas;
if (!nr_good || nr_good >= replicas)
- return DATA_SKIP;
+ return false;
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = replicas - nr_good;
data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+ return true;
}
-static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool migrate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
struct bch_ioctl_data *op = arg;
+ unsigned i = 0;
- if (!bch2_bkey_has_device(k, op->migrate.dev))
- return DATA_SKIP;
-
+ data_opts->rewrite_ptrs = 0;
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
- data_opts->rewrite_dev = op->migrate.dev;
- return DATA_REWRITE;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == op->migrate.dev)
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
}
-static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
-static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
@@ -1025,21 +1039,21 @@ static bool bformat_needs_redo(struct bkey_format *f)
return false;
}
-static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (b->version_ondisk != c->sb.version ||
btree_node_need_rewrite(b) ||
bformat_needs_redo(&b->format)) {
data_opts->target = 0;
- data_opts->nr_replicas = 1;
+ data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
- return DATA_REWRITE;
+ return true;
}
- return DATA_SKIP;
+ return false;
}
int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
@@ -1070,7 +1084,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
- bch_move_stats_init(stats, "rereplicate");
+ bch2_move_stats_init(stats, "rereplicate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
@@ -1083,15 +1097,18 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
- NULL, writepoint_hashed((unsigned long) current),
- rereplicate_pred, c, stats) ?: ret;
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
- bch_move_stats_init(stats, "migrate");
+ bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
@@ -1104,12 +1121,15 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
- NULL, writepoint_hashed((unsigned long) current),
- migrate_pred, &op, stats) ?: ret;
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
- bch_move_stats_init(stats, "rewrite_old_nodes");
+ bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
break;
default:
@@ -1118,3 +1138,67 @@ int bch2_data_job(struct bch_fs *c,
return ret;
}
+
+void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_move_stats *stats;
+
+ mutex_lock(&c->data_progress_lock);
+ list_for_each_entry(stats, &c->data_progress_list, list) {
+ prt_printf(out, "%s: data type %s btree_id %s position: ",
+ stats->name,
+ bch2_data_types[stats->data_type],
+ bch2_btree_ids[stats->btree_id]);
+ bch2_bpos_to_text(out, stats->pos);
+ prt_printf(out, "%s", "\n");
+ }
+ mutex_unlock(&c->data_progress_lock);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ prt_printf(out, "%ps:", ctxt->fn);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "reads: %u sectors %u",
+ atomic_read(&ctxt->read_ios),
+ atomic_read(&ctxt->read_sectors));
+ prt_newline(out);
+
+ prt_printf(out, "writes: %u sectors %u",
+ atomic_read(&ctxt->write_ios),
+ atomic_read(&ctxt->write_sectors));
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ mutex_lock(&ctxt->lock);
+ list_for_each_entry(io, &ctxt->ios, io_list) {
+ bch2_write_op_to_text(out, &io->write.op);
+ }
+ mutex_unlock(&ctxt->lock);
+
+ printbuf_indent_sub(out, 4);
+}
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct moving_context *ctxt;
+
+ mutex_lock(&c->moving_context_lock);
+ list_for_each_entry(ctxt, &c->moving_context_list, list)
+ bch2_moving_ctxt_to_text(out, ctxt);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_fs_move_init(struct bch_fs *c)
+{
+ INIT_LIST_HEAD(&c->moving_context_list);
+ mutex_init(&c->moving_context_lock);
+
+ INIT_LIST_HEAD(&c->data_progress_list);
+ mutex_init(&c->data_progress_lock);
+}
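bch2_evacuate_bucket() above is the standalone wrapper: it sets up its own btree_trans and moving_context around __bch2_evacuate_bucket(). A hedged sketch of a caller (hypothetical; default data_update_opts, no rate limit, waiting on copygc):

    static int example_evacuate_one_bucket(struct bch_fs *c, struct bpos bucket, int gen)
    {
    	struct data_update_opts data_opts = { 0 };
    	struct bch_move_stats stats;

    	bch2_move_stats_init(&stats, "example_evacuate");

    	return bch2_evacuate_bucket(c, bucket, gen, data_opts,
    				    NULL,	/* no rate limit */
    				    &stats,
    				    writepoint_hashed((unsigned long) current),
    				    true);	/* wait_on_copygc */
    }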
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 98323ad93e7c..50a6f7d7a292 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -4,53 +4,64 @@
#include "btree_iter.h"
#include "buckets.h"
-#include "io_types.h"
+#include "data_update.h"
#include "move_types.h"
struct bch_read_bio;
-struct moving_context;
-
-enum data_cmd {
- DATA_SKIP,
- DATA_SCRUB,
- DATA_ADD_REPLICAS,
- DATA_REWRITE,
- DATA_PROMOTE,
-};
-struct data_opts {
- u16 target;
- u8 rewrite_dev;
- u8 nr_replicas;
- int btree_insert_flags;
-};
+struct moving_context {
+ struct bch_fs *c;
+ struct list_head list;
+ void *fn;
-struct migrate_write {
- enum btree_id btree_id;
- enum data_cmd data_cmd;
- struct data_opts data_opts;
+ struct bch_ratelimit *rate;
+ struct bch_move_stats *stats;
+ struct write_point_specifier wp;
+ bool wait_on_copygc;
+ bool write_error;
- unsigned nr_ptrs_reserved;
+ /* For waiting on outstanding reads and writes: */
+ struct closure cl;
- struct moving_context *ctxt;
+ struct mutex lock;
+ struct list_head reads;
+ struct list_head ios;
- /* what we read: */
- struct bch_extent_ptr ptr;
- u64 offset;
+ /* in flight sectors: */
+ atomic_t read_sectors;
+ atomic_t write_sectors;
+ atomic_t read_ios;
+ atomic_t write_ios;
- struct bch_write_op op;
+ wait_queue_head_t wait;
};
-void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
-int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
- struct write_point_specifier,
- struct bch_io_opts,
- enum data_cmd, struct data_opts,
- enum btree_id, struct bkey_s_c);
-
-typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
- struct bkey_s_c,
- struct bch_io_opts *, struct data_opts *);
+void bch2_verify_bucket_evacuated(struct btree_trans *, struct bpos, int);
+
+#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
+do { \
+ bool cond_finished = false; \
+ bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
+ \
+ if (_cond) \
+ break; \
+ __wait_event((_ctxt)->wait, \
+ bch2_moving_ctxt_next_pending_write(_ctxt) || \
+ (cond_finished = (_cond))); \
+ if (cond_finished) \
+ break; \
+} while (1)
+
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+ struct bch_io_opts *, struct data_update_opts *);
+
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+ struct bch_ratelimit *, struct bch_move_stats *,
+ struct write_point_specifier, bool);
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
+ struct btree_trans *);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
@@ -58,21 +69,30 @@ int bch2_move_data(struct bch_fs *,
enum btree_id, struct bpos,
enum btree_id, struct bpos,
struct bch_ratelimit *,
+ struct bch_move_stats *,
struct write_point_specifier,
- move_pred_fn, void *,
- struct bch_move_stats *);
-
+ bool,
+ move_pred_fn, void *);
+
+int __bch2_evacuate_bucket(struct btree_trans *,
+ struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct bpos, int,
+ struct data_update_opts);
+int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
+ struct data_update_opts,
+ struct bch_ratelimit *,
+ struct bch_move_stats *,
+ struct write_point_specifier,
+ bool);
int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
-static inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
-{
- memset(stats, 0, sizeof(*stats));
-
- scnprintf(stats->name, sizeof(stats->name),
- "%s", name);
-}
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_move_init(struct bch_fs *);
#endif /* _BCACHEFS_MOVE_H */
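move_pred_fn above replaces the old enum data_cmd return with a bool plus a filled-in struct data_update_opts; a predicate chooses which pointers to rewrite by setting bits in rewrite_ptrs. A hedged sketch of a trivial predicate (hypothetical; it marks every pointer of every extent for rewrite, following the shape of migrate_pred() in move.c):

    static bool rewrite_all_ptrs_pred(struct bch_fs *c, void *arg,
    				  struct bkey_s_c k,
    				  struct bch_io_opts *io_opts,
    				  struct data_update_opts *data_opts)
    {
    	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
    	const struct bch_extent_ptr *ptr;
    	unsigned i = 0;

    	data_opts->rewrite_ptrs		= 0;
    	data_opts->target		= 0;
    	data_opts->extra_replicas	= 0;
    	data_opts->btree_insert_flags	= 0;

    	/* one bit per pointer, in iteration order */
    	bkey_for_each_ptr(ptrs, ptr)
    		data_opts->rewrite_ptrs |= 1U << i++;

    	return data_opts->rewrite_ptrs != 0;
    }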
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index 9df6d18137a5..baf1f8570b3f 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -16,4 +16,21 @@ struct bch_move_stats {
atomic64_t sectors_raced;
};
+struct move_bucket_key {
+ struct bpos bucket;
+ u8 gen;
+};
+
+struct move_bucket {
+ struct move_bucket_key k;
+ unsigned sectors;
+};
+
+struct move_bucket_in_flight {
+ struct move_bucket_in_flight *next;
+ struct rhash_head hash;
+ struct move_bucket bucket;
+ atomic_t count;
+};
+
#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 4a8c4e3a15e0..d13a120da267 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -6,22 +6,27 @@
*/
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
+#include "errcode.h"
#include "error.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
#include "keylist.h"
+#include "lru.h"
#include "move.h"
#include "movinggc.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
+#include <linux/bsearch.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
@@ -29,261 +34,233 @@
#include <linux/sort.h>
#include <linux/wait.h>
-/*
- * We can't use the entire copygc reserve in one iteration of copygc: we may
- * need the buckets we're freeing up to go back into the copygc reserve to make
- * forward progress, but if the copygc reserve is full they'll be available for
- * any allocation - and it's possible that in a given iteration, we free up most
- * of the buckets we're going to free before we allocate most of the buckets
- * we're going to allocate.
- *
- * If we only use half of the reserve per iteration, then in steady state we'll
- * always have room in the reserve for the buckets we're going to need in the
- * next iteration:
- */
-#define COPYGC_BUCKETS_PER_ITER(ca) \
- ((ca)->free[RESERVE_MOVINGGC].size / 2)
-
-static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
+struct buckets_in_flight {
+ struct rhashtable table;
+ struct move_bucket_in_flight *first;
+ struct move_bucket_in_flight *last;
+ size_t nr;
+ size_t sectors;
+};
+
+static const struct rhashtable_params bch_move_bucket_params = {
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+};
+
+static struct move_bucket_in_flight *
+move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
{
- const struct copygc_heap_entry *l = _l;
- const struct copygc_heap_entry *r = _r;
+ struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
+ int ret;
- return cmp_int(l->dev, r->dev) ?:
- cmp_int(l->offset, r->offset);
-}
+ if (!new)
+ return ERR_PTR(-ENOMEM);
-static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- copygc_heap *h = &c->copygc_heap;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p = { 0 };
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct copygc_heap_entry search = {
- .dev = p.ptr.dev,
- .offset = p.ptr.offset,
- };
-
- ssize_t i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
-#if 0
- /* eytzinger search verify code: */
- ssize_t j = -1, k;
-
- for (k = 0; k < h->used; k++)
- if (h->data[k].offset <= ptr->offset &&
- (j < 0 || h->data[k].offset > h->data[j].offset))
- j = k;
-
- BUG_ON(i != j);
-#endif
- if (i >= 0 &&
- p.ptr.dev == h->data[i].dev &&
- p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
- p.ptr.gen == h->data[i].gen) {
- /*
- * We need to use the journal reserve here, because
- * - journal reclaim depends on btree key cache
- * flushing to make forward progress,
- * - which has to make forward progress when the
- * journal is pre-reservation full,
- * - and depends on allocation - meaning allocator and
- * copygc
- */
-
- data_opts->target = io_opts->background_target;
- data_opts->nr_replicas = 1;
- data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_JOURNAL_RESERVED;
- data_opts->rewrite_dev = p.ptr.dev;
-
- if (p.has_ec)
- data_opts->nr_replicas += p.ec.redundancy;
-
- return DATA_REWRITE;
- }
+ new->bucket = b;
+
+ ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
+ bch_move_bucket_params);
+ if (ret) {
+ kfree(new);
+ return ERR_PTR(ret);
}
- return DATA_SKIP;
+ if (!list->first)
+ list->first = new;
+ else
+ list->last->next = new;
+
+ list->last = new;
+ list->nr++;
+ list->sectors += b.sectors;
+ return new;
}
-static bool have_copygc_reserve(struct bch_dev *ca)
+static int bch2_bucket_is_movable(struct btree_trans *trans,
+ struct move_bucket *b, u64 time)
{
- bool ret;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a;
+ int ret;
- spin_lock(&ca->fs->freelist_lock);
- ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
- ca->allocator_state != ALLOCATOR_running;
- spin_unlock(&ca->fs->freelist_lock);
+ if (bch2_bucket_is_open(trans->c,
+ b->k.bucket.inode,
+ b->k.bucket.offset))
+ return 0;
- return ret;
-}
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ b->k.bucket, BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ bch2_trans_iter_exit(trans, &iter);
-static inline int fragmentation_cmp(copygc_heap *heap,
- struct copygc_heap_entry l,
- struct copygc_heap_entry r)
-{
- return cmp_int(l.fragmentation, r.fragmentation);
+ if (ret)
+ return ret;
+
+ a = bch2_alloc_to_v4(k, &_a);
+ b->k.gen = a->gen;
+ b->sectors = a->dirty_sectors;
+
+ ret = data_type_movable(a->data_type) &&
+ a->fragmentation_lru &&
+ a->fragmentation_lru <= time;
+
+ if (!ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ pr_debug("%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return ret;
}
-static int bch2_copygc(struct bch_fs *c)
+static void move_buckets_wait(struct btree_trans *trans,
+ struct moving_context *ctxt,
+ struct buckets_in_flight *list,
+ bool flush)
{
- copygc_heap *h = &c->copygc_heap;
- struct copygc_heap_entry e, *i;
- struct bucket_array *buckets;
- struct bch_move_stats move_stats;
- u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
- u64 sectors_reserved = 0;
- u64 buckets_to_move, buckets_not_moved = 0;
- struct bch_dev *ca;
- unsigned dev_idx;
- size_t b, heap_size = 0;
+ struct move_bucket_in_flight *i;
int ret;
- bch_move_stats_init(&move_stats, "copygc");
+ while ((i = list->first)) {
+ if (flush)
+ move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
- /*
- * Find buckets with lowest sector counts, skipping completely
- * empty buckets, by building a maxheap sorted by sector count,
- * and repeatedly replacing the maximum element until all
- * buckets have been visited.
- */
- h->used = 0;
+ if (atomic_read(&i->count))
+ break;
- for_each_rw_member(ca, c, dev_idx)
- heap_size += ca->mi.nbuckets >> 7;
+ /*
+ * moving_ctxt_exit calls bch2_write as it flushes pending
+ * reads, which inits another btree_trans; this one must be
+ * unlocked:
+ */
+ bch2_verify_bucket_evacuated(trans, i->bucket.k.bucket, i->bucket.k.gen);
- if (h->size < heap_size) {
- free_heap(&c->copygc_heap);
- if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
- bch_err(c, "error allocating copygc heap");
- return 0;
- }
- }
+ list->first = i->next;
+ if (!list->first)
+ list->last = NULL;
- for_each_rw_member(ca, c, dev_idx) {
- closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
-
- spin_lock(&ca->fs->freelist_lock);
- sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
- spin_unlock(&ca->fs->freelist_lock);
-
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- struct bucket *g = buckets->b + b;
- struct bucket_mark m = READ_ONCE(g->mark);
- struct copygc_heap_entry e;
-
- if (m.owned_by_allocator ||
- m.data_type != BCH_DATA_user ||
- !bucket_sectors_used(m) ||
- bucket_sectors_used(m) >= ca->mi.bucket_size)
- continue;
-
- WARN_ON(m.stripe && !g->stripe_redundancy);
-
- e = (struct copygc_heap_entry) {
- .dev = dev_idx,
- .gen = m.gen,
- .replicas = 1 + g->stripe_redundancy,
- .fragmentation = bucket_sectors_used(m) * (1U << 15)
- / ca->mi.bucket_size,
- .sectors = bucket_sectors_used(m),
- .offset = bucket_to_sector(ca, b),
- };
- heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
- }
- up_read(&ca->bucket_lock);
- }
+ list->nr--;
+ list->sectors -= i->bucket.sectors;
- if (!h->used) {
- bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
- return 0;
+ ret = rhashtable_remove_fast(&list->table, &i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+ kfree(i);
}
- /*
- * Our btree node allocations also come out of RESERVE_MOVINGGC:
- */
- sectors_reserved = (sectors_reserved * 3) / 4;
- if (!sectors_reserved) {
- bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
- return -1;
- }
+ bch2_trans_unlock(trans);
+}
- for (i = h->data; i < h->data + h->used; i++) {
- sectors_to_move += i->sectors;
- sectors_to_write += i->sectors * i->replicas;
- }
+static bool bucket_in_flight(struct buckets_in_flight *list,
+ struct move_bucket_key k)
+{
+ return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
+}
- while (sectors_to_write > sectors_reserved) {
- BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
- sectors_to_write -= e.sectors * e.replicas;
- }
+typedef DARRAY(struct move_bucket) move_buckets;
- buckets_to_move = h->used;
+static int bch2_copygc_get_buckets(struct btree_trans *trans,
+ struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ move_buckets *buckets)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4);
+ size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
+ int ret;
- if (!buckets_to_move) {
- bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
- sectors_reserved);
- return 0;
- }
+ move_buckets_wait(trans, ctxt, buckets_in_flight, false);
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+ __func__, bch2_err_str(ret)))
+ return ret;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
+ lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+ 0, k, ({
+ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
+ int ret = 0;
+
+ saw++;
+
+ if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
+ not_movable++;
+ else if (bucket_in_flight(buckets_in_flight, b.k))
+ in_flight++;
+ else {
+ ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
+ if (ret >= 0)
+ sectors += b.sectors;
+ }
+ ret;
+ }));
- eytzinger0_sort(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, NULL);
+ pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
+ buckets_in_flight->nr, buckets_in_flight->sectors,
+ saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
- ret = bch2_move_data(c,
- 0, POS_MIN,
- BTREE_ID_NR, POS_MAX,
- NULL,
- writepoint_ptr(&c->copygc_write_point),
- copygc_pred, NULL,
- &move_stats);
+ return ret < 0 ? ret : 0;
+}
- for_each_rw_member(ca, c, dev_idx) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
- for (i = h->data; i < h->data + h->used; i++) {
- struct bucket_mark m;
- size_t b;
-
- if (i->dev != dev_idx)
- continue;
-
- b = sector_to_bucket(ca, i->offset);
- m = READ_ONCE(buckets->b[b].mark);
-
- if (i->gen == m.gen &&
- bucket_sectors_used(m)) {
- sectors_not_moved += bucket_sectors_used(m);
- buckets_not_moved++;
- }
+static int bch2_copygc(struct btree_trans *trans,
+ struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight)
+{
+ struct bch_fs *c = trans->c;
+ struct data_update_opts data_opts = {
+ .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
+ };
+ move_buckets buckets = { 0 };
+ struct move_bucket_in_flight *f;
+ struct move_bucket *i;
+ u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+ int ret = 0;
+
+ ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+ if (ret)
+ goto err;
+
+ darray_for_each(buckets, i) {
+ if (unlikely(freezing(current)))
+ break;
+
+ f = move_bucket_in_flight_add(buckets_in_flight, *i);
+ ret = PTR_ERR_OR_ZERO(f);
+ if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */
+ continue;
+ if (ret == -ENOMEM) { /* flush IO, continue later */
+ ret = 0;
+ break;
}
- up_read(&ca->bucket_lock);
+
+ ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+ f->bucket.k.gen, data_opts);
+ if (ret)
+ goto err;
}
+err:
+ darray_exit(&buckets);
- if (sectors_not_moved && !ret)
- bch_warn_ratelimited(c,
- "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
- sectors_not_moved, sectors_to_move,
- buckets_not_moved, buckets_to_move,
- atomic64_read(&move_stats.sectors_moved),
- atomic64_read(&move_stats.keys_raced),
- atomic64_read(&move_stats.sectors_raced));
-
- trace_copygc(c,
- atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
- buckets_to_move, buckets_not_moved);
- return 0;
+ /* no entries in LRU btree found, or got to end: */
+ if (ret == -ENOENT)
+ ret = 0;
+
+ if (ret < 0 && !bch2_err_matches(ret, EROFS))
+ bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
+
+ moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
+ trace_and_count(c, copygc, c, moved, 0, 0, 0);
+ return ret;
}
/*
@@ -305,13 +282,18 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
struct bch_dev *ca;
unsigned dev_idx;
s64 wait = S64_MAX, fragmented_allowed, fragmented;
+ unsigned i;
for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
- fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
- ca->mi.bucket_size) >> 1);
- fragmented = usage.d[BCH_DATA_user].fragmented;
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_stripe) *
+ ca->mi.bucket_size) >> 1);
+ fragmented = 0;
+
+ for (i = 0; i < BCH_DATA_NR; i++)
+ if (data_type_movable(i))
+ fragmented += usage.d[i].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented));
}
@@ -319,26 +301,75 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
return wait;
}
+void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ prt_printf(out, "Currently waiting for: ");
+ prt_human_readable_u64(out, max(0LL, c->copygc_wait -
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently waiting since: ");
+ prt_human_readable_u64(out, max(0LL,
+ atomic64_read(&c->io_clock[WRITE].now) -
+ c->copygc_wait_at) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently calculated wait: ");
+ prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
+ prt_newline(out);
+}
+
static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
+ struct btree_trans trans;
+ struct moving_context ctxt;
+ struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
+ struct buckets_in_flight move_buckets;
u64 last, wait;
+ int ret = 0;
+
+ memset(&move_buckets, 0, sizeof(move_buckets));
+
+ ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params);
+ if (ret) {
+ bch_err(c, "error allocating copygc buckets in flight: %s",
+ bch2_err_str(ret));
+ return ret;
+ }
set_freezable();
+ bch2_trans_init(&trans, c, 0, 0);
+
+ bch2_move_stats_init(&move_stats, "copygc");
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+ writepoint_ptr(&c->copygc_write_point),
+ false);
- while (!kthread_should_stop()) {
+ while (!ret && !kthread_should_stop()) {
+ bch2_trans_unlock(&trans);
cond_resched();
- if (kthread_wait_freezable(c->copy_gc_enabled))
- break;
+ if (!c->copy_gc_enabled) {
+ move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+ kthread_wait_freezable(c->copy_gc_enabled);
+ }
+
+ if (unlikely(freezing(current))) {
+ move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+ __refrigerator(false);
+ continue;
+ }
last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
- trace_copygc_wait(c, wait, last + wait);
+ c->copygc_wait_at = last;
c->copygc_wait = last + wait;
+ move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+ trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
continue;
@@ -346,10 +377,17 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
- if (bch2_copygc(c))
- break;
+ c->copygc_running = true;
+ ret = bch2_copygc(&trans, &ctxt, &move_buckets);
+ c->copygc_running = false;
+
+ wake_up(&c->copygc_running_wq);
}
+ move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+ bch2_trans_exit(&trans);
+ bch2_moving_ctxt_exit(&ctxt);
+
return 0;
}
@@ -365,6 +403,7 @@ void bch2_copygc_stop(struct bch_fs *c)
int bch2_copygc_start(struct bch_fs *c)
{
struct task_struct *t;
+ int ret;
if (c->copygc_thread)
return 0;
@@ -376,9 +415,10 @@ int bch2_copygc_start(struct bch_fs *c)
return -ENOMEM;
t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
- if (IS_ERR(t)) {
- bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
- return PTR_ERR(t);
+ ret = PTR_ERR_OR_ZERO(t);
+ if (ret) {
+ bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret));
+ return ret;
}
get_task_struct(t);
@@ -391,4 +431,6 @@ int bch2_copygc_start(struct bch_fs *c)
void bch2_fs_copygc_init(struct bch_fs *c)
{
+ init_waitqueue_head(&c->copygc_running_wq);
+ c->copygc_running = false;
}
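The reworked bch2_copygc_wait_amount() above gives each rw device an allowance of half its available capacity for fragmentation in movable data; copygc sleeps on the write io_clock until that allowance is exceeded. A worked example with assumed numbers, not taken from the patch:

static s64 example_copygc_wait(void)
{
	/* one rw device: 10000 buckets available at RESERVE_stripe,
	 * 1024-sector buckets, 3000000 fragmented sectors of movable data */
	s64 fragmented_allowed = (10000LL * 1024) >> 1;	/* 5120000 sectors */
	s64 fragmented         = 3000000;

	/* copygc won't run until the write clock advances ~2120000 sectors */
	return max(0LL, fragmented_allowed - fragmented);
}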
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index 922738247d03..ea181fef5bc9 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -2,6 +2,9 @@
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
+unsigned long bch2_copygc_wait_amount(struct bch_fs *);
+void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+
void bch2_copygc_stop(struct bch_fs *);
int bch2_copygc_start(struct bch_fs *);
void bch2_fs_copygc_init(struct bch_fs *);
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
new file mode 100644
index 000000000000..396357cd8f2f
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "nocow_locking.h"
+#include "util.h"
+
+#include <linux/closure.h>
+
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
+ return true;
+ return false;
+}
+
+#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0)
+
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ int lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket) {
+ BUG_ON(sign(atomic_read(&l->l[i])) != lock_val);
+
+ if (!atomic_sub_return(lock_val, &l->l[i]))
+ closure_wake_up(&l->wait);
+ return;
+ }
+
+ BUG();
+}
+
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ int v, lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ spin_lock(&l->lock);
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket)
+ goto got_entry;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (!atomic_read(&l->l[i])) {
+ l->b[i] = dev_bucket;
+ goto take_lock;
+ }
+fail:
+ spin_unlock(&l->lock);
+ return false;
+got_entry:
+ v = atomic_read(&l->l[i]);
+ if (lock_val > 0 ? v < 0 : v > 0)
+ goto fail;
+take_lock:
+ atomic_add(lock_val, &l->l[i]);
+ spin_unlock(&l->lock);
+ return true;
+}
+
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
+ struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
+ u64 start_time = local_clock();
+
+ __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+ bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ }
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
+{
+ unsigned i, nr_zero = 0;
+ struct nocow_lock_bucket *l;
+
+ for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
+ unsigned v = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++)
+ v |= atomic_read(&l->l[i]);
+
+ if (!v) {
+ nr_zero++;
+ continue;
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+ nr_zero = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++)
+ if (atomic_read(&l->l[i]))
+ prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i]));
+ prt_newline(out);
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+}
+
+int bch2_fs_nocow_locking_init(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++)
+ spin_lock_init(&c->nocow_locks.l[i].lock);
+
+ return 0;
+}
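In __bch2_bucket_nocow_trylock() the two lock flavours count with opposite signs in nocow_lock_bucket->l[]: flags == 0 decrements, BUCKET_NOCOW_LOCK_UPDATE (defined in the new header below) increments, so same-flavour holders stack while opposite flavours conflict. A hedged sketch of that rule; the function and variable names here are placeholders, not part of the patch:

static void example_nocow_sign_rule(struct nocow_lock_bucket *l, u64 dev_bucket)
{
	bool shared1 = __bch2_bucket_nocow_trylock(l, dev_bucket, 0);
	bool shared2 = __bch2_bucket_nocow_trylock(l, dev_bucket, 0);
	bool update  = __bch2_bucket_nocow_trylock(l, dev_bucket,
						   BUCKET_NOCOW_LOCK_UPDATE);

	/* shared1 and shared2 succeed and stack (the slot's count reaches -2);
	 * update fails while any 0-flag hold remains, since a positive locker
	 * conflicts with a negative count. Holds are dropped with
	 * bch2_bucket_nocow_unlock() (not shown). */
	WARN_ON(!shared1 || !shared2 || update);
}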
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
new file mode 100644
index 000000000000..ff8e4af52edc
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_H
+#define _BCACHEFS_NOCOW_LOCKING_H
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "nocow_locking_types.h"
+
+#include <linux/hash.h>
+
+static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ u64 dev_bucket)
+{
+ unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
+
+ return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
+}
+
+#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
+
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
+ struct nocow_lock_bucket *, u64, int);
+
+static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
+}
+
+static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
+
+int bch2_fs_nocow_locking_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_H */
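A usage sketch for the inline wrappers above, assuming a struct bch_fs *c and a bucket position already in hand (both placeholders); blocking lockers wait on the hash bucket's closure_waitlist and the contention time is accounted to BCH_TIME_nocow_lock_contended:

static void example_nocow_usage(struct bch_fs *c, struct bpos bucket)
{
	/* shared hold (flags == 0), e.g. while reading data that an
	 * in-place write must not overwrite underneath us: */
	bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
	bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);

	/* update hold for an in-place (nocow) write; conflicts with the
	 * flags == 0 side, stacks with other update holders: */
	if (bch2_bucket_nocow_trylock(&c->nocow_locks, bucket,
				      BUCKET_NOCOW_LOCK_UPDATE))
		bch2_bucket_nocow_unlock(&c->nocow_locks, bucket,
					 BUCKET_NOCOW_LOCK_UPDATE);
}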
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
new file mode 100644
index 000000000000..bd12bf677924
--- /dev/null
+++ b/fs/bcachefs/nocow_locking_types.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
+#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
+
+#define BUCKET_NOCOW_LOCKS_BITS 10
+#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
+
+struct nocow_lock_bucket {
+ struct closure_waitlist wait;
+ spinlock_t lock;
+ u64 b[4];
+ atomic_t l[4];
+} __aligned(SMP_CACHE_BYTES);
+
+struct bucket_nocow_lock_table {
+ struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS];
+};
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
+
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 71bf26eb13d5..04e2989cd6b3 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -9,7 +9,12 @@
#include "super-io.h"
#include "util.h"
-#define x(t, n) #t,
+#define x(t, n) [n] = #t,
+
+const char * const bch2_metadata_versions[] = {
+ BCH_METADATA_VERSIONS()
+ NULL
+};
const char * const bch2_error_actions[] = {
BCH_ERROR_ACTIONS()
@@ -28,6 +33,7 @@ const char * const bch2_sb_compat[] = {
const char * const bch2_btree_ids[] = {
BCH_BTREE_IDS()
+ "interior btree node",
NULL
};
@@ -96,6 +102,16 @@ const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_SUBVOL] = "subvol",
};
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+ BUG();
+}
+
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+ BUG();
+}
+
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define x(_name, ...) \
@@ -209,62 +225,75 @@ static int bch2_mount_opt_lookup(const char *name)
return bch2_opt_lookup(name);
}
-static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v)
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
{
if (v < opt->min) {
- if (msg)
- pr_err("invalid %s%s: too small (min %llu)",
- msg, opt->attr.name, opt->min);
+ if (err)
+ prt_printf(err, "%s: too small (min %llu)",
+ opt->attr.name, opt->min);
return -ERANGE;
}
if (opt->max && v >= opt->max) {
- if (msg)
- pr_err("invalid %s%s: too big (max %llu)",
- msg, opt->attr.name, opt->max);
+ if (err)
+ prt_printf(err, "%s: too big (max %llu)",
+ opt->attr.name, opt->max);
return -ERANGE;
}
if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
- if (msg)
- pr_err("invalid %s %s: not a multiple of 512",
- msg, opt->attr.name);
+ if (err)
+ prt_printf(err, "%s: not a multiple of 512",
+ opt->attr.name);
return -EINVAL;
}
if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
- if (msg)
- pr_err("invalid %s%s: must be a power of two",
- msg, opt->attr.name);
+ if (err)
+ prt_printf(err, "%s: must be a power of two",
+ opt->attr.name);
return -EINVAL;
}
return 0;
}
-int bch2_opt_parse(struct bch_fs *c, const char *msg,
+int bch2_opt_parse(struct bch_fs *c,
const struct bch_option *opt,
- const char *val, u64 *res)
+ const char *val, u64 *res,
+ struct printbuf *err)
{
ssize_t ret;
switch (opt->type) {
case BCH_OPT_BOOL:
ret = kstrtou64(val, 10, res);
- if (ret < 0)
+ if (ret < 0 || (*res != 0 && *res != 1)) {
+ if (err)
+ prt_printf(err, "%s: must be bool",
+ opt->attr.name);
return ret;
+ }
break;
case BCH_OPT_UINT:
ret = opt->flags & OPT_HUMAN_READABLE
? bch2_strtou64_h(val, res)
: kstrtou64(val, 10, res);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: must be a number",
+ opt->attr.name);
return ret;
+ }
break;
case BCH_OPT_STR:
ret = match_string(opt->choices, -1, val);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: invalid selection",
+ opt->attr.name);
return ret;
+ }
*res = ret;
break;
@@ -273,44 +302,49 @@ int bch2_opt_parse(struct bch_fs *c, const char *msg,
return 0;
ret = opt->parse(c, val, res);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: parse error",
+ opt->attr.name);
return ret;
+ }
}
- return bch2_opt_validate(opt, msg, *res);
+ return bch2_opt_validate(opt, *res, err);
}
-void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
+void bch2_opt_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bch_sb *sb,
const struct bch_option *opt, u64 v,
unsigned flags)
{
if (flags & OPT_SHOW_MOUNT_STYLE) {
if (opt->type == BCH_OPT_BOOL) {
- pr_buf(out, "%s%s",
+ prt_printf(out, "%s%s",
v ? "" : "no",
opt->attr.name);
return;
}
- pr_buf(out, "%s=", opt->attr.name);
+ prt_printf(out, "%s=", opt->attr.name);
}
switch (opt->type) {
case BCH_OPT_BOOL:
case BCH_OPT_UINT:
if (opt->flags & OPT_HUMAN_READABLE)
- bch2_hprint(out, v);
+ prt_human_readable_u64(out, v);
else
- pr_buf(out, "%lli", v);
+ prt_printf(out, "%lli", v);
break;
case BCH_OPT_STR:
if (flags & OPT_SHOW_FULL_LIST)
- bch2_string_opt_to_text(out, opt->choices, v);
+ prt_string_option(out, opt->choices, v);
else
- pr_buf(out, opt->choices[v]);
+ prt_printf(out, "%s", opt->choices[v]);
break;
case BCH_OPT_FN:
- opt->to_text(out, c, v);
+ opt->to_text(out, c, sb, v);
break;
default:
BUG();
@@ -356,6 +390,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
int ret, id;
+ struct printbuf err = PRINTBUF;
u64 v;
if (!options)
@@ -375,8 +410,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
if (id < 0)
goto bad_opt;
- ret = bch2_opt_parse(c, "mount option ",
- &bch2_opt_table[id], val, &v);
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
if (ret < 0)
goto bad_val;
} else {
@@ -419,7 +453,7 @@ bad_opt:
ret = -1;
goto out;
bad_val:
- pr_err("Invalid value %s for mount option %s", val, name);
+ pr_err("Invalid mount option %s", err.buf);
ret = -1;
goto out;
no_val:
@@ -428,9 +462,26 @@ no_val:
goto out;
out:
kfree(copied_opts_start);
+ printbuf_exit(&err);
return ret;
}
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+{
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ v = opt->get_sb(sb);
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = 1ULL << v;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v <<= 9;
+
+ return v;
+}
+
/*
* Initial options from superblock - here we don't want any options undefined,
* any options the superblock doesn't specify are set to 0:
@@ -438,28 +489,14 @@ out:
int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
{
unsigned id;
- int ret;
for (id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;
- u64 v;
- if (opt->get_sb == NO_SB_OPT)
+ if (opt->get_sb == BCH2_NO_SB_OPT)
continue;
- v = opt->get_sb(sb);
-
- if (opt->flags & OPT_SB_FIELD_ILOG2)
- v = 1ULL << v;
-
- if (opt->flags & OPT_SB_FIELD_SECTORS)
- v <<= 9;
-
- ret = bch2_opt_validate(opt, "superblock option ", v);
- if (ret)
- return ret;
-
- bch2_opt_set_by_id(opts, id, v);
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
}
return 0;
@@ -467,7 +504,7 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_NO_SB_OPT)
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
if (opt->flags & OPT_SB_FIELD_SECTORS)
@@ -481,7 +518,7 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_NO_SB_OPT)
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
mutex_lock(&c->sb_lock);
@@ -494,33 +531,11 @@ void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
{
- struct bch_io_opts ret = { 0 };
-#define x(_name, _bits) \
- if (opt_defined(src, _name)) \
- opt_set(ret, _name, src._name);
- BCH_INODE_OPTS()
-#undef x
- return ret;
-}
-
-struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
-{
- struct bch_opts ret = { 0 };
-#define x(_name, _bits) \
- if (opt_defined(src, _name)) \
- opt_set(ret, _name, src._name);
- BCH_INODE_OPTS()
-#undef x
- return ret;
-}
-
-void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
-{
-#define x(_name, _bits) \
- if (opt_defined(src, _name)) \
- opt_set(*dst, _name, src._name);
+ return (struct bch_io_opts) {
+#define x(_name, _bits) ._name = src._name,
BCH_INODE_OPTS()
#undef x
+ };
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
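With error reporting moved into a caller-supplied printbuf, option parsing outside of bch2_parse_mount_opts() follows the same pattern. A hedged sketch; the helper name and the error message prefix are illustrative, not part of this patch:

static int example_parse_one_opt(struct bch_fs *c, const char *name,
				 const char *val, u64 *res)
{
	struct printbuf err = PRINTBUF;
	int id = bch2_opt_lookup(name);
	int ret;

	if (id < 0)
		return -EINVAL;

	ret = bch2_opt_parse(c, &bch2_opt_table[id], val, res, &err);
	if (ret < 0)
		pr_err("Invalid option %s", err.buf);

	printbuf_exit(&err);
	return ret;
}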
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index c325a094ae43..719693b333da 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -8,6 +8,7 @@
#include <linux/sysfs.h>
#include "bcachefs_format.h"
+extern const char * const bch2_metadata_versions[];
extern const char * const bch2_error_actions[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
@@ -42,7 +43,8 @@ static inline const char *bch2_d_type_str(unsigned d_type)
*/
/* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
/* When can be set: */
enum opt_flags {
@@ -90,6 +92,12 @@ enum opt_type {
#define RATELIMIT_ERRORS_DEFAULT false
#endif
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCACHEFS_VERBOSE_DEFAULT true
+#else
+#define BCACHEFS_VERBOSE_DEFAULT false
+#endif
+
#define BCH_OPTS() \
x(block_size, u16, \
OPT_FS|OPT_FORMAT| \
@@ -163,22 +171,22 @@ enum opt_type {
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
- "(target)", "Device or disk group for metadata writes") \
+ "(target)", "Device or label for metadata writes") \
x(foreground_target, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_FOREGROUND_TARGET, 0, \
- "(target)", "Device or disk group for foreground writes") \
+ "(target)", "Device or label for foreground writes") \
x(background_target, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_BACKGROUND_TARGET, 0, \
- "(target)", "Device or disk group to move data to in the background")\
+ "(target)", "Device or label to move data to in the background")\
x(promote_target, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0, \
- "(target)", "Device or disk group to promote data to on read")\
+ "(target)", "Device or label to promote data to on read") \
x(erasure_code, u16, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -202,8 +210,13 @@ enum opt_type {
x(btree_node_mem_ptr_optimization, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, true, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
+ x(btree_write_buffer_size, u32, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(16, (1U << 20) - 1), \
+ BCH2_NO_SB_OPT, 1U << 13, \
+ NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
@@ -229,7 +242,7 @@ enum opt_type {
x(inline_data, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, true, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
@@ -254,26 +267,26 @@ enum opt_type {
x(degraded, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in degraded mode") \
x(very_degraded, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, true, \
NULL, "Enable discard/TRIM support") \
x(verbose, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
NULL, "Extra debugging information during mount/recovery")\
x(journal_flush_delay, u32, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
- OPT_UINT(0, U32_MAX), \
+ OPT_UINT(1, U32_MAX), \
BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
NULL, "Delay in milliseconds before automatic journal commits")\
x(journal_flush_disabled, u8, \
@@ -288,47 +301,57 @@ enum opt_type {
OPT_UINT(0, U32_MAX), \
BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
NULL, "Delay in milliseconds before automatic journal reclaim")\
+ x(move_bytes_in_flight, u32, \
+ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1024, U32_MAX), \
+ BCH2_NO_SB_OPT, 1U << 20, \
+ NULL, "Maximum Amount of IO to keep in flight by the move path")\
+ x(move_ios_in_flight, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 1024), \
+ BCH2_NO_SB_OPT, 32, \
+ NULL, "Maximum number of IOs to keep in flight by the move path")\
x(fsck, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Fix errors during fsck without asking") \
x(ratelimit_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
+ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
NULL, "Ratelimit error messages during fsck") \
x(nochanges, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Super read only mode - no writes at all will be issued,\n"\
"even if we have to replay the journal") \
x(norecovery, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't replay the journal") \
- x(rebuild_replicas, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- NO_SB_OPT, false, \
- NULL, "Rebuild the superblock replicas section") \
x(keep_journal, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't free journal entries/keys after startup")\
x(read_entire_journal, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Read all journal entries, not just dirty ones")\
+ x(read_journal_only, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Only read the journal, skip the rest of recovery")\
x(journal_transaction_names, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -337,53 +360,82 @@ enum opt_type {
x(noexcl, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don't open device in exclusive mode") \
+ x(direct_io, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Use O_DIRECT (userspace only)") \
x(sb, u64, \
OPT_MOUNT, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, BCH_SB_SECTOR, \
+ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
OPT_FS, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, NULL) \
x(nostart, u8, \
0, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Don\'t start filesystem, only open devices") \
x(reconstruct_alloc, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Reconstruct alloc btree") \
x(version_upgrade, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, "Set superblock to latest version,\n" \
"allowing any new features to be used") \
+ x(buckets_nouse, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allocate the buckets_nouse bitmap") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, false, \
NULL, NULL) \
+ x(nocow, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_BOOL(), \
+ BCH_SB_NOCOW, false, \
+ NULL, "Nocow mode: Writes will be done in place when possible.\n"\
+ "Snapshots and reflink will still caused writes to be COW\n"\
+ "Implicitly disables data checksumming, compression and encryption")\
+ x(nocow_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable nocow mode: enables runtime locking in\n"\
+ "data move path needed if nocow will ever be in use\n")\
+ x(no_data_io, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Skip submit_bio() for data reads and writes, " \
+ "for performance testing purposes") \
x(fs_size, u64, \
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, 0, \
+ BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(bucket, u32, \
OPT_DEVICE, \
OPT_UINT(0, S64_MAX), \
- NO_SB_OPT, 0, \
+ BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(durability, u8, \
OPT_DEVICE, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
- NO_SB_OPT, 1, \
+ BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times")
@@ -442,17 +494,9 @@ struct bch_option {
enum opt_flags flags;
u64 min, max;
- union {
- struct {
- };
- struct {
- const char * const *choices;
- };
- struct {
- int (*parse)(struct bch_fs *, const char *, u64 *);
- void (*to_text)(struct printbuf *, struct bch_fs *, u64);
- };
- };
+ const char * const *choices;
+ int (*parse)(struct bch_fs *, const char *, u64 *);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
const char *hint;
const char *help;
@@ -465,18 +509,20 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
-int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *,
- const char *, u64 *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+ const char *, u64 *, struct printbuf *);
#define OPT_SHOW_FULL_LIST (1 << 0)
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
-void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
@@ -486,18 +532,12 @@ int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
/* inode opts: */
struct bch_io_opts {
-#define x(_name, _bits) unsigned _name##_defined:1;
- BCH_INODE_OPTS()
-#undef x
-
#define x(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef x
};
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
-struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
-void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
#endif /* _BCACHEFS_OPTS_H */
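struct bch_io_opts above now drops the per-option _defined bitfields, so the BCH_INODE_OPTS() x-macro expands to plain per-option fields and bch2_opts_to_inode_opts() becomes a straight member-for-member copy. A rough sketch of the resulting usage; the field list in the comment is abridged and assumed, not copied from the macro:

static struct bch_io_opts example_io_opts(struct bch_fs *c)
{
	/* roughly, after x-macro expansion (fields abridged/assumed):
	 *	struct bch_io_opts { u8 data_checksum; u8 compression; ... };
	 */
	return bch2_opts_to_inode_opts(c->opts);
}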
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
new file mode 100644
index 000000000000..c41daa180682
--- /dev/null
+++ b/fs/bcachefs/printbuf.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "printbuf.h"
+
+static inline unsigned printbuf_linelen(struct printbuf *buf)
+{
+ return buf->pos - buf->last_newline;
+}
+
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
+ unsigned new_size;
+ char *buf;
+
+ if (!out->heap_allocated)
+ return 0;
+
+ /* Reserved space for terminating nul: */
+ extra += 1;
+
+ if (out->pos + extra < out->size)
+ return 0;
+
+ new_size = roundup_pow_of_two(out->size + extra);
+
+ /*
+ * Note: output buffer must be freeable with kfree(), it's not required
+ * that the user use printbuf_exit().
+ */
+ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+
+ if (!buf) {
+ out->allocation_failure = true;
+ return -ENOMEM;
+ }
+
+ out->buf = buf;
+ out->size = new_size;
+ return 0;
+}
+
+void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+ int len;
+
+ do {
+ va_list args2;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ do {
+ va_start(args, fmt);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+ va_end(args);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+/**
+ * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
+ * terminated
+ */
+const char *bch2_printbuf_str(const struct printbuf *buf)
+{
+ /*
+ * If we've written to a printbuf then it's guaranteed to be a null
+ * terminated string - but if we haven't, then we might not have
+ * allocated a buffer at all:
+ */
+ return buf->pos
+ ? buf->buf
+ : "";
+}
+
+/**
+ * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
+ * against accidental use.
+ */
+void bch2_printbuf_exit(struct printbuf *buf)
+{
+ if (buf->heap_allocated) {
+ kfree(buf->buf);
+ buf->buf = ERR_PTR(-EINTR); /* poison value */
+ }
+}
+
+void bch2_printbuf_tabstops_reset(struct printbuf *buf)
+{
+ buf->nr_tabstops = 0;
+}
+
+void bch2_printbuf_tabstop_pop(struct printbuf *buf)
+{
+ if (buf->nr_tabstops)
+ --buf->nr_tabstops;
+}
+
+/*
+ * printbuf_tabstop_push - add a tabstop, n spaces from the previous tabstop
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces from previous tabstop
+ *
+ * In the future this function may allocate memory if setting more than
+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
+ * of line.
+ */
+int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+{
+ unsigned prev_tabstop = buf->nr_tabstops
+ ? buf->_tabstops[buf->nr_tabstops - 1]
+ : 0;
+
+ if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
+ return -EINVAL;
+
+ buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
+ buf->has_indent_or_tabstops = true;
+ return 0;
+}
+
+/**
+ * printbuf_indent_add - add to the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces more spaces.
+ */
+void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
+ spaces = 0;
+
+ buf->indent += spaces;
+ prt_chars(buf, ' ', spaces);
+
+ buf->has_indent_or_tabstops = true;
+}
+
+/**
+ * printbuf_indent_sub - subtract from the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to subtract from the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces less spaces.
+ */
+void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(spaces > buf->indent))
+ spaces = buf->indent;
+
+ if (buf->last_newline + buf->indent == buf->pos) {
+ buf->pos -= spaces;
+ printbuf_nul_terminate(buf);
+ }
+ buf->indent -= spaces;
+
+ if (!buf->indent && !buf->nr_tabstops)
+ buf->has_indent_or_tabstops = false;
+}
+
+void bch2_prt_newline(struct printbuf *buf)
+{
+ unsigned i;
+
+ bch2_printbuf_make_room(buf, 1 + buf->indent);
+
+ __prt_char(buf, '\n');
+
+ buf->last_newline = buf->pos;
+
+ for (i = 0; i < buf->indent; i++)
+ __prt_char(buf, ' ');
+
+ printbuf_nul_terminate(buf);
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop = 0;
+}
+
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+
+static void __prt_tab(struct printbuf *out)
+{
+ int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
+
+ prt_chars(out, ' ', spaces);
+
+ out->last_field = out->pos;
+ out->cur_tabstop++;
+}
+
+/**
+ * prt_tab - Advance printbuf to the next tabstop
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by printing spaces.
+ */
+void bch2_prt_tab(struct printbuf *out)
+{
+ if (WARN_ON(!cur_tabstop(out)))
+ return;
+
+ __prt_tab(out);
+}
+
+static void __prt_tab_rjust(struct printbuf *buf)
+{
+ unsigned move = buf->pos - buf->last_field;
+ int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
+
+ if (pad > 0) {
+ bch2_printbuf_make_room(buf, pad);
+
+ if (buf->last_field + pad < buf->size)
+ memmove(buf->buf + buf->last_field + pad,
+ buf->buf + buf->last_field,
+ min(move, buf->size - 1 - buf->last_field - pad));
+
+ if (buf->last_field < buf->size)
+ memset(buf->buf + buf->last_field, ' ',
+ min((unsigned) pad, buf->size - buf->last_field));
+
+ buf->pos += pad;
+ printbuf_nul_terminate(buf);
+ }
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop++;
+}
+
+/**
+ * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * previous output
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by inserting spaces immediately after the
+ * previous tabstop, right justifying previously outputted text.
+ */
+void bch2_prt_tab_rjust(struct printbuf *buf)
+{
+ if (WARN_ON(!cur_tabstop(buf)))
+ return;
+
+ __prt_tab_rjust(buf);
+}
+
+/**
+ * prt_bytes_indented - Print an array of chars, handling embedded control characters
+ *
+ * @out: printbuf to output to
+ * @str: string to print
+ * @count: number of bytes to print
+ *
+ * The following control characters are handled as follows:
+ * \n: prt_newline newline that obeys current indent level
+ * \t: prt_tab advance to next tabstop
+ * \r: prt_tab_rjust advance to next tabstop, with right justification
+ */
+void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+{
+ const char *unprinted_start = str;
+ const char *end = str + count;
+
+ if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
+ prt_bytes(out, str, count);
+ return;
+ }
+
+ while (str != end) {
+ switch (*str) {
+ case '\n':
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ bch2_prt_newline(out);
+ break;
+ case '\t':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab(out);
+ }
+ break;
+ case '\r':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab_rjust(out);
+ }
+ break;
+ }
+
+ str++;
+ }
+
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+}
+
+/**
+ * prt_human_readable_u64 - Print out a u64 in human readable units
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ */
+void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v)
+{
+ bch2_printbuf_make_room(buf, 10);
+ buf->pos += string_get_size(v, 1, !buf->si_units,
+ buf->buf + buf->pos,
+ printbuf_remaining_size(buf));
+}
+
+/**
+ * prt_human_readable_s64 - Print out a s64 in human readable units
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ */
+void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v)
+{
+ if (v < 0)
+ prt_char(buf, '-');
+ bch2_prt_human_readable_u64(buf, abs(v));
+}
+
+/**
+ * prt_units_u64 - Print out a u64 according to printbuf unit options
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_u64(struct printbuf *out, u64 v)
+{
+ if (out->human_readable_units)
+ bch2_prt_human_readable_u64(out, v);
+ else
+ bch2_prt_printf(out, "%llu", v);
+}
+
+/**
+ * prt_units_s64 - Print out a s64 according to printbuf unit options
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ bch2_prt_units_u64(out, abs(v));
+}
+
+void bch2_prt_string_option(struct printbuf *out,
+ const char * const list[],
+ size_t selected)
+{
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
+}
+
+void bch2_prt_bitflags(struct printbuf *out,
+ const char * const list[], u64 flags)
+{
+ unsigned bit, nr = 0;
+ bool first = true;
+
+ while (list[nr])
+ nr++;
+
+ while (flags && (bit = __ffs(flags)) < nr) {
+ if (!first)
+ bch2_prt_printf(out, ",");
+ first = false;
+ bch2_prt_printf(out, "%s", list[bit]);
+ flags ^= 1ULL << bit;
+ }
+}
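A short usage sketch for the tabstop helpers above together with prt_str() from the header below (not part of the patch); it builds one right-justified, two-column line. Tabstop widths are cumulative: pushing 24 and then 12 places stops at columns 24 and 36.

static void example_printbuf_columns(void)
{
	struct printbuf buf = PRINTBUF;

	bch2_printbuf_tabstop_push(&buf, 24);	/* label column ends at 24 */
	bch2_printbuf_tabstop_push(&buf, 12);	/* value column ends at 36 */

	prt_str(&buf, "capacity:");
	bch2_prt_tab(&buf);			/* pad with spaces to column 24 */
	bch2_prt_units_u64(&buf, 536870912);
	bch2_prt_tab_rjust(&buf);		/* right justify the number at 36 */
	bch2_prt_newline(&buf);

	printk("%s", buf.buf);
	bch2_printbuf_exit(&buf);
}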
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
new file mode 100644
index 000000000000..2191423d9f22
--- /dev/null
+++ b/fs/bcachefs/printbuf.h
@@ -0,0 +1,284 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _BCACHEFS_PRINTBUF_H
+#define _BCACHEFS_PRINTBUF_H
+
+/*
+ * Printbufs: Simple strings for printing to, with optional heap allocation
+ *
+ * This code has provisions for use in userspace, to aid in making other code
+ * portable between kernelspace and userspace.
+ *
+ * Basic example:
+ * struct printbuf buf = PRINTBUF;
+ *
+ * prt_printf(&buf, "foo=");
+ * foo_to_text(&buf, foo);
+ * printk("%s", buf.buf);
+ * printbuf_exit(&buf);
+ *
+ * Or
+ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
+ *
+ * We can now write pretty printers instead of writing code that dumps
+ * everything to the kernel log buffer, and then those pretty-printers can be
+ * used by other code that outputs to kernel log, sysfs, debugfs, etc.
+ *
+ * Memory allocation: Outputting to a printbuf may allocate memory. This
+ * allocation is done with GFP_KERNEL, by default: use the newer
+ * memalloc_*_(save|restore) functions as needed.
+ *
+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
+ *
+ * It's allowed to grab the output buffer and free it later with kfree() instead
+ * of using printbuf_exit(), if the user just needs a heap allocated string at
+ * the end.
+ *
+ * Memory allocation failures: We don't return errors directly, because on
+ * memory allocation failure we usually don't want to bail out and unwind - we
+ * want to print what we've got, on a best-effort basis. But code that does want
+ * to return -ENOMEM may check printbuf.allocation_failure.
+ *
+ * Indenting, tabstops:
+ *
+ * To aid in writing multi-line pretty printers spread across multiple
+ * functions, printbufs track the current indent level.
+ *
+ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the
+ * current indent level, respectively.
+ *
+ * To use tabstops, add them with printbuf_tabstop_push(), in spaces relative
+ * to the previous tabstop. Once set, prt_tab() will output spaces up to the
+ * next tabstop. prt_tab_rjust() will also advance the current line of text up
+ * to the next tabstop, but it does so by shifting text since the previous
+ * tabstop up to the next tabstop - right justifying it.
+ *
+ * Make sure you use prt_newline() instead of \n in the format string for
+ * indent level and tabstops to work correctly.
+ *
+ * Output units: printbuf->si_units and printbuf->human_readable_units tell
+ * pretty-printers how to output numbers: a raw value (e.g. directly from a
+ * superblock field), as bytes, or as human readable bytes. prt_units_u64()
+ * and prt_units_s64() obey them.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+enum printbuf_si {
+ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
+ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
+};
+
+#define PRINTBUF_INLINE_TABSTOPS 6
+
+struct printbuf {
+ char *buf;
+ unsigned size;
+ unsigned pos;
+ unsigned last_newline;
+ unsigned last_field;
+ unsigned indent;
+ /*
+ * If nonzero, allocations will be done with GFP_ATOMIC:
+ */
+ u8 atomic;
+ bool allocation_failure:1;
+ bool heap_allocated:1;
+ enum printbuf_si si_units:1;
+ bool human_readable_units:1;
+ bool has_indent_or_tabstops:1;
+ bool suppress_indent_tabstop_handling:1;
+ u8 nr_tabstops;
+
+ /*
+ * Do not modify directly: use printbuf_tabstop_push(),
+ * printbuf_tabstop_pop()
+ */
+ u8 cur_tabstop;
+ u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
+};
+
+int bch2_printbuf_make_room(struct printbuf *, unsigned);
+__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
+__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
+const char *bch2_printbuf_str(const struct printbuf *);
+void bch2_printbuf_exit(struct printbuf *);
+
+void bch2_printbuf_tabstops_reset(struct printbuf *);
+void bch2_printbuf_tabstop_pop(struct printbuf *);
+int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
+
+void bch2_printbuf_indent_add(struct printbuf *, unsigned);
+void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
+
+void bch2_prt_newline(struct printbuf *);
+void bch2_prt_tab(struct printbuf *);
+void bch2_prt_tab_rjust(struct printbuf *);
+
+void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void bch2_prt_human_readable_u64(struct printbuf *, u64);
+void bch2_prt_human_readable_s64(struct printbuf *, s64);
+void bch2_prt_units_u64(struct printbuf *, u64);
+void bch2_prt_units_s64(struct printbuf *, s64);
+void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
+void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
+
+/* Initializer for a heap allocated printbuf: */
+#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
+
+/* Initializer for a printbuf that points to an external buffer: */
+#define PRINTBUF_EXTERN(_buf, _size) \
+((struct printbuf) { \
+ .buf = _buf, \
+ .size = _size, \
+})
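A brief sketch contrasting the two initializers (editor's illustration, not part of the patch): PRINTBUF grows a heap buffer on demand, while PRINTBUF_EXTERN writes into a fixed caller-supplied buffer and should never allocate; once an external buffer fills up, further output is dropped, which printbuf_overflowed() (below) reports.

	static void example_two_initializers(void)
	{
		char stack_buf[128];
		struct printbuf heap  = PRINTBUF;
		struct printbuf fixed = PRINTBUF_EXTERN(stack_buf, sizeof(stack_buf));

		prt_printf(&heap,  "dev %u", 1);	/* may allocate and grow */
		prt_printf(&fixed, "dev %u", 1);	/* writes into stack_buf only */

		printbuf_exit(&heap);			/* frees heap.buf */
		/* nothing to free for 'fixed' */
	}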
+
+/*
+ * Returns space remaining in the output buffer, including room for the
+ * terminating nul:
+ */
+static inline unsigned printbuf_remaining_size(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos : 0;
+}
+
+/*
+ * Returns number of characters we can print to the output buffer - i.e.
+ * excluding the terminating nul:
+ */
+static inline unsigned printbuf_remaining(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos - 1 : 0;
+}
+
+static inline unsigned printbuf_written(struct printbuf *out)
+{
+ return out->size ? min(out->pos, out->size - 1) : 0;
+}
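A quick worked example of the three helpers above, assuming size = 8 and pos = 5:

	printbuf_remaining_size()  ->  8 - 5     = 3	/* bytes left, incl. room for the nul */
	printbuf_remaining()       ->  8 - 5 - 1 = 2	/* characters that can still be printed */
	printbuf_written()         ->  min(5, 7) = 5	/* characters already in the buffer */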
+
+/*
+ * Returns true if output was truncated:
+ */
+static inline bool printbuf_overflowed(struct printbuf *out)
+{
+ return out->pos >= out->size;
+}
+
+static inline void printbuf_nul_terminate(struct printbuf *out)
+{
+ bch2_printbuf_make_room(out, 1);
+
+ if (out->pos < out->size)
+ out->buf[out->pos] = 0;
+ else if (out->size)
+ out->buf[out->size - 1] = 0;
+}
+
+/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
+static inline void __prt_char_reserved(struct printbuf *out, char c)
+{
+ if (printbuf_remaining(out))
+ out->buf[out->pos] = c;
+ out->pos++;
+}
+
+/* Doesn't nul terminate: */
+static inline void __prt_char(struct printbuf *out, char c)
+{
+ bch2_printbuf_make_room(out, 1);
+ __prt_char_reserved(out, c);
+}
+
+static inline void prt_char(struct printbuf *out, char c)
+{
+ __prt_char(out, c);
+ printbuf_nul_terminate(out);
+}
+
+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
+{
+ unsigned i, can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = c;
+ out->pos += n - can_print;
+}
+
+static inline void prt_chars(struct printbuf *out, char c, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+ __prt_chars_reserved(out, c, n);
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
+{
+ unsigned i, can_print;
+
+ bch2_printbuf_make_room(out, n);
+
+ can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = ((char *) b)[i];
+ out->pos += n - can_print;
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_str(struct printbuf *out, const char *str)
+{
+ prt_bytes(out, str, strlen(str));
+}
+
+static inline void prt_str_indented(struct printbuf *out, const char *str)
+{
+ bch2_prt_bytes_indented(out, str, strlen(str));
+}
+
+static inline void prt_hex_byte(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_hi(byte));
+ __prt_char_reserved(out, hex_asc_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_upper_hi(byte));
+ __prt_char_reserved(out, hex_asc_upper_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+/**
+ * printbuf_reset - re-use a printbuf without freeing and re-initializing it:
+ */
+static inline void printbuf_reset(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->allocation_failure = 0;
+ buf->indent = 0;
+ buf->nr_tabstops = 0;
+ buf->cur_tabstop = 0;
+}
+
+/**
+ * printbuf_atomic_inc - mark as entering an atomic section
+ */
+static inline void printbuf_atomic_inc(struct printbuf *buf)
+{
+ buf->atomic++;
+}
+
+/**
+ * printbuf_atomic_dec - mark as leaving an atomic section
+ */
+static inline void printbuf_atomic_dec(struct printbuf *buf)
+{
+ buf->atomic--;
+}
+
+#endif /* _BCACHEFS_PRINTBUF_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 6fb8224f565e..331f22835d18 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -1,44 +1,81 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
+#include "errcode.h"
#include "inode.h"
#include "quota.h"
#include "subvolume.h"
#include "super-io.h"
-static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f,
+static const char * const bch2_quota_types[] = {
+ "user",
+ "group",
+ "project",
+};
+
+static const char * const bch2_quota_counters[] = {
+ "space",
+ "inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
if (vstruct_bytes(&q->field) < sizeof(*q)) {
- pr_buf(err, "wrong size (got %llu should be %zu)",
+ prt_printf(err, "wrong size (got %zu should be %zu)",
vstruct_bytes(&q->field), sizeof(*q));
+ return -BCH_ERR_invalid_sb_quota;
}
return 0;
}
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+ unsigned qtyp, counter;
+
+ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+ prt_printf(out, "%s: flags %llx",
+ bch2_quota_types[qtyp],
+ le64_to_cpu(q->q[qtyp].flags));
+
+ for (counter = 0; counter < Q_COUNTERS; counter++)
+ prt_printf(out, " %s timelimit %u warnlimit %u",
+ bch2_quota_counters[counter],
+ le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+ le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
+
+ prt_newline(out);
+ }
+}
+
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
- .validate = bch2_sb_validate_quota,
+ .validate = bch2_sb_quota_validate,
+ .to_text = bch2_sb_quota_to_text,
};
-const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (k.k->p.inode >= QTYP_NR)
- return "invalid quota type";
+ if (k.k->p.inode >= QTYP_NR) {
+ prt_printf(err, "invalid quota type (%llu >= %u)",
+ k.k->p.inode, QTYP_NR);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
- return "incorrect value size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_quota));
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
-static const char * const bch2_quota_counters[] = {
- "space",
- "inodes",
-};
-
void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -46,7 +83,7 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
unsigned i;
for (i = 0; i < Q_COUNTERS; i++)
- pr_buf(out, "%s hardlimit %llu softlimit %llu",
+ prt_printf(out, "%s hardlimit %llu softlimit %llu",
bch2_quota_counters[i],
le64_to_cpu(dq.v->c[i].hardlimit),
le64_to_cpu(dq.v->c[i].softlimit));
@@ -58,6 +95,113 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
#include <linux/fs.h>
#include <linux/quota.h>
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "i_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", i->i_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "i_flags");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_flags);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_warnlimit);
+ prt_newline(out);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "d_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", q->d_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_space");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_space);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_count");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_count);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_ino_warns);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_spc_warns);
+ prt_newline(out);
+}
+
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
{
qtypes >>= i;
@@ -188,34 +332,20 @@ static int bch2_quota_check_limit(struct bch_fs *c,
if (qc->hardlimit &&
qc->hardlimit < n &&
!ignore_hardlimit(q)) {
- if (mode == KEY_TYPE_QUOTA_PREALLOC)
- return -EDQUOT;
-
prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+ return -EDQUOT;
}
if (qc->softlimit &&
- qc->softlimit < n &&
- qc->timer &&
- ktime_get_real_seconds() >= qc->timer &&
- !ignore_hardlimit(q)) {
- if (mode == KEY_TYPE_QUOTA_PREALLOC)
- return -EDQUOT;
-
- prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
- }
-
- if (qc->softlimit &&
- qc->softlimit < n &&
- qc->timer == 0) {
- if (mode == KEY_TYPE_QUOTA_PREALLOC)
+ qc->softlimit < n) {
+ if (qc->timer == 0) {
+ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+ } else if (ktime_get_real_seconds() >= qc->timer &&
+ !ignore_hardlimit(q)) {
+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
return -EDQUOT;
-
- prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
-
- /* XXX is this the right one? */
- qc->timer = ktime_get_real_seconds() +
- q->limits[counter].warnlimit;
+ }
}
return 0;
@@ -234,16 +364,16 @@ int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
memset(&msgs, 0, sizeof(msgs));
+ for_each_set_qtype(c, i, q, qtypes) {
+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
+ if (!mq[i])
+ return -ENOMEM;
+ }
+
for_each_set_qtype(c, i, q, qtypes)
mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
- mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
- if (!mq[i]) {
- ret = -ENOMEM;
- goto err;
- }
-
ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
if (ret)
goto err;
@@ -286,18 +416,17 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
memset(&msgs, 0, sizeof(msgs));
+ for_each_set_qtype(c, i, q, qtypes) {
+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
+ if (!src_q[i] || !dst_q[i])
+ return -ENOMEM;
+ }
+
for_each_set_qtype(c, i, q, qtypes)
mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
- src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
- dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
-
- if (!src_q[i] || !dst_q[i]) {
- ret = -ENOMEM;
- goto err;
- }
-
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
dst_q[i]->c[Q_SPC].v + space,
mode);
@@ -325,7 +454,8 @@ err:
return ret;
}
-static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
+ struct qc_dqblk *qdq)
{
struct bkey_s_c_quota dq;
struct bch_memquota_type *q;
@@ -334,6 +464,9 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
BUG_ON(k.k->p.inode >= QTYP_NR);
+ if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
+ return 0;
+
switch (k.k->type) {
case KEY_TYPE_quota:
dq = bkey_s_c_to_quota(k);
@@ -351,36 +484,21 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
}
+ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
+ mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer);
+ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
+ mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns);
+ if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
+ mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer);
+ if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
+ mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns);
+
mutex_unlock(&q->lock);
}
return 0;
}
-static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0),
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->p.inode != type)
- break;
-
- ret = __bch2_quota_set(c, k);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
- return ret;
-}
-
void bch2_fs_quota_exit(struct bch_fs *c)
{
unsigned i;
@@ -397,6 +515,26 @@ void bch2_fs_quota_init(struct bch_fs *c)
mutex_init(&c->quotas[i].lock);
}
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+ struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb);
+
+ if (sb_quota)
+ return sb_quota;
+
+ sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64));
+ if (sb_quota) {
+ unsigned qtype, qc;
+
+ for (qtype = 0; qtype < QTYP_NR; qtype++)
+ for (qc = 0; qc < Q_COUNTERS; qc++)
+ sb_quota->q[qtype].c[qc].timelimit =
+ cpu_to_le32(7 * 24 * 60 * 60);
+ }
+
+ return sb_quota;
+}
+
static void bch2_sb_quota_read(struct bch_fs *c)
{
struct bch_sb_field_quota *sb_quota;
@@ -419,22 +557,14 @@ static void bch2_sb_quota_read(struct bch_fs *c)
}
static int bch2_fs_quota_read_inode(struct btree_trans *trans,
- struct btree_iter *iter)
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct bch_subvolume subvolume;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (!k.k)
- return 1;
-
ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
if (ret)
return ret;
@@ -463,36 +593,35 @@ advance:
int bch2_fs_quota_read(struct bch_fs *c)
{
- unsigned i, qtypes = enabled_qtypes(c);
- struct bch_memquota_type *q;
+ struct bch_sb_field_quota *sb_quota;
struct btree_trans trans;
struct btree_iter iter;
+ struct bkey_s_c k;
int ret;
mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ mutex_unlock(&c->sb_lock);
+ return -BCH_ERR_ENOSPC_sb_quota;
+ }
+
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
- for_each_set_qtype(c, i, q, qtypes) {
- ret = bch2_quota_init_type(c, i);
- if (ret)
- return ret;
- }
-
bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
- do {
- ret = lockrestart_do(&trans,
- bch2_fs_quota_read_inode(&trans, &iter));
- } while (!ret);
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key2(&trans, iter, BTREE_ID_inodes,
+ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ bch2_fs_quota_read_inode(&trans, &iter, k));
+ if (ret)
+ bch_err(c, "err in quota_read: %s", bch2_err_str(ret));
bch2_trans_exit(&trans);
- return ret < 0 ? ret : 0;
+ return ret;
}
/* Enable/disable/delete quotas for an entire filesystem: */
@@ -500,6 +629,8 @@ int bch2_fs_quota_read(struct bch_fs *c)
static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ int ret = 0;
if (sb->s_flags & SB_RDONLY)
return -EROFS;
@@ -519,6 +650,12 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
return -EINVAL;
mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
+ }
+
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
@@ -529,9 +666,10 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return bch2_err_class(ret);
}
static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
@@ -571,7 +709,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_USR, 0),
- POS(QTYP_USR + 1, 0),
+ POS(QTYP_USR, U64_MAX),
0, NULL);
if (ret)
return ret;
@@ -583,7 +721,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_GRP, 0),
- POS(QTYP_GRP + 1, 0),
+ POS(QTYP_GRP, U64_MAX),
0, NULL);
if (ret)
return ret;
@@ -595,7 +733,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_PRJ, 0),
- POS(QTYP_PRJ + 1, 0),
+ POS(QTYP_PRJ, U64_MAX),
0, NULL);
if (ret)
return ret;
@@ -643,6 +781,15 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
struct bch_fs *c = sb->s_fs_info;
struct bch_sb_field_quota *sb_quota;
struct bch_memquota_type *q;
+ int ret = 0;
+
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_info_to_text(&buf, info);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
if (sb->s_flags & SB_RDONLY)
return -EROFS;
@@ -660,12 +807,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
q = &c->quotas[type];
mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- sb_quota = bch2_sb_resize_quota(&c->disk_sb,
- sizeof(*sb_quota) / sizeof(u64));
- if (!sb_quota)
- return -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
}
if (info->i_fieldmask & QC_SPC_TIMER)
@@ -687,9 +832,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
bch2_sb_quota_read(c);
bch2_write_super(c);
+unlock:
mutex_unlock(&c->sb_lock);
- return 0;
+ return bch2_err_class(ret);
}
/* Get/set individual quotas: */
@@ -794,6 +940,14 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
struct bkey_i_quota new_quota;
int ret;
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_dqblk_to_text(&buf, qdq);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
if (sb->s_flags & SB_RDONLY)
return -EROFS;
@@ -802,7 +956,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
- __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
return ret;
}
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
index 51e4f9713ef0..146264fd16ce 100644
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
@@ -7,13 +7,13 @@
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_ops_quota (struct bkey_ops) { \
+#define bch2_bkey_ops_quota ((struct bkey_ops) { \
.key_invalid = bch2_quota_invalid, \
.val_to_text = bch2_quota_to_text, \
-}
+})
static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
{
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index a573fede05b1..4df981bd96df 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -6,6 +6,7 @@
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
+#include "errcode.h"
#include "extents.h"
#include "io.h"
#include "move.h"
@@ -22,62 +23,70 @@
* returns -1 if it should not be moved, or
* device of pointer that should be moved, if known, or INT_MAX if unknown
*/
-static int __bch2_rebalance_pred(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ unsigned i;
+
+ data_opts->rewrite_ptrs = 0;
+ data_opts->target = io_opts->background_target;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
if (io_opts->background_compression &&
- !bch2_bkey_is_incompressible(k))
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ !bch2_bkey_is_incompressible(k)) {
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached &&
p.crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
- return p.ptr.dev;
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
- if (io_opts->background_target)
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
- return p.ptr.dev;
+ if (io_opts->background_target) {
+ const struct bch_extent_ptr *ptr;
+
+ i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached &&
+ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target))
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
- return -1;
+ return data_opts->rewrite_ptrs != 0;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
- atomic64_t *counter;
- int dev;
+ struct data_update_opts update_opts = { 0 };
+ struct bkey_ptrs_c ptrs;
+ const struct bch_extent_ptr *ptr;
+ unsigned i;
- dev = __bch2_rebalance_pred(c, k, io_opts);
- if (dev < 0)
+ if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
return;
- counter = dev < INT_MAX
- ? &bch_dev_bkey_exists(c, dev)->rebalance_work
- : &c->rebalance.work_unknown_dev;
-
- if (atomic64_add_return(k.k->size, counter) == k.k->size)
- rebalance_wakeup(c);
-}
-
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_opts *data_opts)
-{
- if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
- data_opts->target = io_opts->background_target;
- data_opts->nr_replicas = 1;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
- } else {
- return DATA_SKIP;
+ i = 0;
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr) {
+ if ((1U << i) & update_opts.rewrite_ptrs)
+ if (atomic64_add_return(k.k->size,
+ &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
+ k.k->size)
+ rebalance_wakeup(c);
+ i++;
}
}
@@ -180,7 +189,7 @@ static int bch2_rebalance_thread(void *arg)
prev_start = jiffies;
prev_cputime = curr_cputime();
- bch_move_stats_init(&move_stats, "rebalance");
+ bch2_move_stats_init(&move_stats, "rebalance");
while (!kthread_wait_freezable(r->enabled)) {
cond_resched();
@@ -245,9 +254,10 @@ static int bch2_rebalance_thread(void *arg)
BTREE_ID_NR, POS_MAX,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
+ &move_stats,
writepoint_ptr(&c->rebalance_write_point),
- rebalance_pred, NULL,
- &move_stats);
+ true,
+ rebalance_pred, NULL);
}
return 0;
@@ -257,35 +267,48 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
- char h1[21], h2[21];
- bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9);
- bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9);
- pr_buf(out, "fullest_dev (%i):\t%s/%s\n",
- w.dev_most_full_idx, h1, h2);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
+ prt_tab(out);
+
+ prt_human_readable_u64(out, w.dev_most_full_work << 9);
+ prt_printf(out, "/");
+ prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
+ prt_newline(out);
+
+ prt_printf(out, "total work:");
+ prt_tab(out);
- bch2_hprint(&PBUF(h1), w.total_work << 9);
- bch2_hprint(&PBUF(h2), c->capacity << 9);
- pr_buf(out, "total work:\t\t%s/%s\n", h1, h2);
+ prt_human_readable_u64(out, w.total_work << 9);
+ prt_printf(out, "/");
+ prt_human_readable_u64(out, c->capacity << 9);
+ prt_newline(out);
- pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate);
+ prt_printf(out, "rate:");
+ prt_tab(out);
+ prt_printf(out, "%u", r->pd.rate.rate);
+ prt_newline(out);
switch (r->state) {
case REBALANCE_WAITING:
- pr_buf(out, "waiting\n");
+ prt_printf(out, "waiting");
break;
case REBALANCE_THROTTLED:
- bch2_hprint(&PBUF(h1),
+ prt_printf(out, "throttled for %lu sec or ",
+ (r->throttled_until_cputime - jiffies) / HZ);
+ prt_human_readable_u64(out,
(r->throttled_until_iotime -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
- pr_buf(out, "throttled for %lu sec or %s io\n",
- (r->throttled_until_cputime - jiffies) / HZ,
- h1);
+ prt_printf(out, " io");
break;
case REBALANCE_RUNNING:
- pr_buf(out, "running\n");
+ prt_printf(out, "running");
break;
}
+ prt_newline(out);
}
void bch2_rebalance_stop(struct bch_fs *c)
@@ -310,6 +333,7 @@ void bch2_rebalance_stop(struct bch_fs *c)
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
+ int ret;
if (c->rebalance.thread)
return 0;
@@ -318,9 +342,10 @@ int bch2_rebalance_start(struct bch_fs *c)
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
- if (IS_ERR(p)) {
- bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p));
- return PTR_ERR(p);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
+ return ret;
}
get_task_struct(p);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index b818093eab39..91a66b5916eb 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "alloc_background.h"
#include "btree_gc.h"
@@ -10,12 +11,14 @@
#include "buckets.h"
#include "dirent.h"
#include "ec.h"
+#include "errcode.h"
#include "error.h"
#include "fs-common.h"
#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "lru.h"
#include "move.h"
#include "quota.h"
#include "recovery.h"
@@ -71,40 +74,118 @@ static int journal_key_cmp(const struct journal_key *l, const struct journal_key
return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
-size_t bch2_journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
- size_t l = 0, r = journal_keys->nr, m;
+ size_t gap_size = keys->size - keys->nr;
+
+ if (idx >= keys->gap)
+ idx += gap_size;
+ return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+ return keys->d + idx_to_pos(keys, idx);
+}
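For readers new to this layout: keys->d[] is maintained as a gap buffer, with an unused hole of (keys->size - keys->nr) slots at array position keys->gap, so inserting at the gap is cheap. A standalone sketch of the index mapping these helpers implement (struct gap_buf and gap_idx_to_pos() are hypothetical names, for illustration only):

	struct gap_buf {
		size_t nr;	/* live entries */
		size_t size;	/* total slots, including the hole */
		size_t gap;	/* array position of the hole */
	};

	/* map a logical index 0..nr-1 to its array position, skipping the hole */
	static size_t gap_idx_to_pos(const struct gap_buf *b, size_t idx)
	{
		return idx < b->gap ? idx : idx + (b->size - b->nr);
	}

	/* e.g. nr = 6, size = 8, gap = 4: logical 0..3 -> positions 0..3,
	 * logical 4..5 -> positions 6..7 (positions 4..5 are the hole) */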
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ size_t l = 0, r = keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
l = m + 1;
else
r = m;
}
- BUG_ON(l < journal_keys->nr &&
- __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
+ BUG_ON(l < keys->nr &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
BUG_ON(l &&
- __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
return l;
}
-static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
- struct bkey_i *n = iter->keys->d[idx].k;
- struct btree_and_journal_iter *biter =
- container_of(iter, struct btree_and_journal_iter, journal);
-
- if (iter->idx > idx ||
- (iter->idx == idx &&
- biter->last &&
- bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
- iter->idx++;
+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+ return NULL;
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
+ !k->overwritten)
+ return k->k;
+
+ (*idx)++;
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
+{
+ size_t idx = 0;
+
+ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ /* The key we just inserted is immediately before the gap: */
+ size_t gap_end = keys->gap + (keys->size - keys->nr);
+ struct btree_and_journal_iter *iter;
+
+ /*
+ * If an iterator points one after the key we just inserted, decrement
+ * the iterator so it points at the key we just inserted - if the
+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+ * handle that:
+ */
+ list_for_each_entry(iter, &c->journal_iters, journal.list)
+ if (iter->journal.idx == gap_end)
+ iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_iter *iter;
+ size_t gap_size = keys->size - keys->nr;
+
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ if (iter->idx > old_gap)
+ iter->idx -= gap_size;
+ if (iter->idx >= new_gap)
+ iter->idx += gap_size;
+ }
}
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
@@ -122,12 +203,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
.journal_seq = U32_MAX,
};
struct journal_keys *keys = &c->journal_keys;
- struct journal_iter *iter;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
BUG_ON(test_bit(BCH_FS_RW, &c->flags));
- if (idx < keys->nr &&
+ if (idx < keys->size &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
if (keys->d[idx].allocated)
kfree(keys->d[idx].k);
@@ -135,29 +215,40 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
return 0;
}
+ if (idx > keys->gap)
+ idx -= keys->size - keys->nr;
+
if (keys->nr == keys->size) {
struct journal_keys new_keys = {
.nr = keys->nr,
- .size = keys->size * 2,
- .journal_seq_base = keys->journal_seq_base,
+ .size = max_t(size_t, keys->size, 8) * 2,
};
- new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
+ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
if (!new_keys.d) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_key_insert;
}
+ /* Since @keys was full, there was no gap: */
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
kvfree(keys->d);
*keys = new_keys;
+
+ /* And now the gap is at the end: */
+ keys->gap = keys->nr;
}
- array_insert_item(keys->d, keys->nr, idx, n);
+ journal_iters_move_gap(c, keys->gap, idx);
+
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+ keys->gap = idx;
- list_for_each_entry(iter, &c->journal_iters, list)
- journal_iter_fix(c, iter, idx);
+ keys->nr++;
+ keys->d[keys->gap++] = n;
+
+ journal_iters_fix(c);
return 0;
}
@@ -175,7 +266,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
if (!n)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_key_insert;
bkey_copy(n, k);
ret = bch2_journal_key_insert_take(c, id, level, n);
@@ -201,34 +292,37 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (idx < keys->nr &&
+ if (idx < keys->size &&
keys->d[idx].btree_id == btree &&
keys->d[idx].level == level &&
- !bpos_cmp(keys->d[idx].k->k.p, pos))
+ bpos_eq(keys->d[idx].k->k.p, pos))
keys->d[idx].overwritten = true;
}
-static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+ if (iter->idx < iter->keys->size) {
+ iter->idx++;
+ if (iter->idx == iter->keys->gap)
+ iter->idx += iter->keys->size - iter->keys->nr;
+ }
+}
+
+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
struct journal_key *k = iter->keys->d + iter->idx;
- while (k < iter->keys->d + iter->keys->nr &&
+ while (k < iter->keys->d + iter->keys->size &&
k->btree_id == iter->btree_id &&
k->level == iter->level) {
if (!k->overwritten)
- return k->k;
+ return bkey_i_to_s_c(k->k);
- iter->idx++;
+ bch2_journal_iter_advance(iter);
k = iter->keys->d + iter->idx;
}
- return NULL;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
- if (iter->idx < iter->keys->nr)
- iter->idx++;
+ return bkey_s_c_null;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -260,71 +354,49 @@ static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
- switch (iter->last) {
- case none:
- break;
- case btree:
- bch2_journal_iter_advance_btree(iter);
- break;
- case journal:
- bch2_journal_iter_advance(&iter->journal);
- break;
- }
-
- iter->last = none;
+ if (bpos_eq(iter->pos, SPOS_MAX))
+ iter->at_end = true;
+ else
+ iter->pos = bpos_successor(iter->pos);
}
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
- struct bkey_s_c ret;
-
- while (1) {
- struct bkey_s_c btree_k =
- bch2_journal_iter_peek_btree(iter);
- struct bkey_s_c journal_k =
- bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
+ struct bkey_s_c btree_k, journal_k, ret;
+again:
+ if (iter->at_end)
+ return bkey_s_c_null;
- if (btree_k.k && journal_k.k) {
- int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+ bpos_lt(btree_k.k->p, iter->pos))
+ bch2_journal_iter_advance_btree(iter);
- if (!cmp)
- bch2_journal_iter_advance_btree(iter);
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
- iter->last = cmp < 0 ? btree : journal;
- } else if (btree_k.k) {
- iter->last = btree;
- } else if (journal_k.k) {
- iter->last = journal;
- } else {
- iter->last = none;
- return bkey_s_c_null;
- }
+ ret = journal_k.k &&
+ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
+ ? journal_k
+ : btree_k;
- ret = iter->last == journal ? journal_k : btree_k;
+ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
+ ret = bkey_s_c_null;
- if (iter->b &&
- bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
- iter->journal.idx = iter->journal.keys->nr;
- iter->last = none;
- return bkey_s_c_null;
+ if (ret.k) {
+ iter->pos = ret.k->p;
+ if (bkey_deleted(ret.k)) {
+ bch2_btree_and_journal_iter_advance(iter);
+ goto again;
}
-
- if (!bkey_deleted(ret.k))
- break;
-
- bch2_btree_and_journal_iter_advance(iter);
+ } else {
+ iter->pos = SPOS_MAX;
+ iter->at_end = true;
}
return ret;
}
-struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter)
-{
- bch2_btree_and_journal_iter_advance(iter);
-
- return bch2_btree_and_journal_iter_peek(iter);
-}
-
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
bch2_journal_iter_exit(&iter->journal);
@@ -342,6 +414,8 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter
iter->node_iter = node_iter;
bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
INIT_LIST_HEAD(&iter->journal.list);
+ iter->pos = b->data->min_key;
+ iter->at_end = false;
}
/*
@@ -361,16 +435,16 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i
/* sort and dedup all keys in the journal: */
-void bch2_journal_entries_free(struct list_head *list)
+void bch2_journal_entries_free(struct bch_fs *c)
{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ if (*i)
+ kvpfree(*i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&(*i)->j));
+ genradix_free(&c->journal_entries);
}
/*
@@ -390,77 +464,115 @@ void bch2_journal_keys_free(struct journal_keys *keys)
{
struct journal_key *i;
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
for (i = keys->d; i < keys->d + keys->nr; i++)
if (i->allocated)
kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
- keys->nr = 0;
+ keys->nr = keys->gap = keys->size = 0;
}
-static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
+static void __journal_keys_sort(struct journal_keys *keys)
{
- struct journal_replay *i;
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
- struct journal_keys keys = { NULL };
struct journal_key *src, *dst;
- size_t nr_keys = 0;
- if (list_empty(journal_entries))
- return keys;
+ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
- list_for_each_entry(i, journal_entries, list) {
- if (i->ignore)
- continue;
+ src = dst = keys->d;
+ while (src < keys->d + keys->nr) {
+ while (src + 1 < keys->d + keys->nr &&
+ src[0].btree_id == src[1].btree_id &&
+ src[0].level == src[1].level &&
+ bpos_eq(src[0].k->k.p, src[1].k->k.p))
+ src++;
- if (!keys.journal_seq_base)
- keys.journal_seq_base = le64_to_cpu(i->j.seq);
+ *dst++ = *src++;
+ }
- for_each_jset_key(k, _n, entry, &i->j)
+ keys->nr = dst - keys->d;
+}
+
+static int journal_keys_sort(struct bch_fs *c)
+{
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+ struct jset_entry *entry;
+ struct bkey_i *k;
+ struct journal_keys *keys = &c->journal_keys;
+ size_t nr_keys = 0, nr_read = 0;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ for_each_jset_key(k, entry, &i->j)
nr_keys++;
}
- keys.size = roundup_pow_of_two(nr_keys);
+ if (!nr_keys)
+ return 0;
- keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
- if (!keys.d)
- goto err;
+ keys->size = roundup_pow_of_two(nr_keys);
- list_for_each_entry(i, journal_entries, list) {
- if (i->ignore)
+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+ if (!keys->d) {
+ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
+ nr_keys);
+
+ do {
+ keys->size >>= 1;
+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+ } while (!keys->d && keys->size > nr_keys / 8);
+
+ if (!keys->d) {
+ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
+ keys->size);
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+ }
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
- BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+ cond_resched();
- for_each_jset_key(k, _n, entry, &i->j)
- keys.d[keys.nr++] = (struct journal_key) {
+ for_each_jset_key(k, entry, &i->j) {
+ if (keys->nr == keys->size) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr > keys->size * 7 / 8) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
+ keys->nr, keys->size, nr_read, nr_keys);
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+ }
+
+ keys->d[keys->nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
- .journal_seq = le64_to_cpu(i->j.seq) -
- keys.journal_seq_base,
+ .journal_seq = le64_to_cpu(i->j.seq),
.journal_offset = k->_data - i->j._data,
};
- }
-
- sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
-
- src = dst = keys.d;
- while (src < keys.d + keys.nr) {
- while (src + 1 < keys.d + keys.nr &&
- src[0].btree_id == src[1].btree_id &&
- src[0].level == src[1].level &&
- !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
- src++;
- *dst++ = *src++;
+ nr_read++;
+ }
}
- keys.nr = dst - keys.d;
-err:
- return keys;
+ __journal_keys_sort(keys);
+ keys->gap = keys->nr;
+
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+ return 0;
}
/* journal replay: */
@@ -468,7 +580,8 @@ err:
static void replay_now_at(struct journal *j, u64 seq)
{
BUG_ON(seq < j->replay_journal_seq);
- BUG_ON(seq > j->replay_journal_seq_end);
+
+ seq = min(seq, j->replay_journal_seq_end);
while (j->replay_journal_seq < seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
@@ -511,7 +624,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq);
}
-static int bch2_journal_replay(struct bch_fs *c)
+static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq)
{
struct journal_keys *keys = &c->journal_keys;
struct journal_key **keys_sorted, *k;
@@ -519,9 +632,12 @@ static int bch2_journal_replay(struct bch_fs *c)
size_t i;
int ret;
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
if (!keys_sorted)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_replay;
for (i = 0; i < keys->nr; i++)
keys_sorted[i] = &keys->d[i];
@@ -530,26 +646,30 @@ static int bch2_journal_replay(struct bch_fs *c)
sizeof(keys_sorted[0]),
journal_sort_seq_cmp, NULL);
- if (keys->nr)
- replay_now_at(j, keys->journal_seq_base);
+ if (keys->nr) {
+ ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
+ keys->nr, start_seq, end_seq);
+ if (ret)
+ goto err;
+ }
for (i = 0; i < keys->nr; i++) {
k = keys_sorted[i];
cond_resched();
- if (!k->allocated)
- replay_now_at(j, keys->journal_seq_base + k->journal_seq);
+ replay_now_at(j, k->journal_seq);
ret = bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+ (!k->allocated
+ ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved
+ : 0),
bch2_journal_replay_key(&trans, k));
if (ret) {
- bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
- ret, bch2_btree_ids[k->btree_id], k->level);
+ bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
+ bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
goto err;
}
}
@@ -560,6 +680,9 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
ret = bch2_journal_error(j);
+
+ if (keys->nr && !ret)
+ bch2_journal_log_msg(c, "journal replay finished");
err:
kvfree(keys_sorted);
return ret;
@@ -630,7 +753,6 @@ static int journal_replay_entry_early(struct bch_fs *c,
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
- ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
@@ -670,10 +792,8 @@ static int journal_replay_entry_early(struct bch_fs *c,
}
static int journal_replay_early(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct list_head *journal)
+ struct bch_sb_field_clean *clean)
{
- struct journal_replay *i;
struct jset_entry *entry;
int ret;
@@ -686,8 +806,13 @@ static int journal_replay_early(struct bch_fs *c,
return ret;
}
} else {
- list_for_each_entry(i, journal, list) {
- if (i->ignore)
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
vstruct_for_each(&i->j, entry) {
@@ -742,6 +867,8 @@ static int verify_superblock_clean(struct bch_fs *c,
{
unsigned i;
struct bch_sb_field_clean *clean = *cleanp;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
int ret = 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
@@ -754,7 +881,6 @@ static int verify_superblock_clean(struct bch_fs *c,
}
for (i = 0; i < BTREE_ID_NR; i++) {
- char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
unsigned l1 = 0, l2 = 0;
@@ -764,19 +890,34 @@ static int verify_superblock_clean(struct bch_fs *c,
if (!k1 && !k2)
continue;
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ if (k1)
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+ else
+ prt_printf(&buf1, "(none)");
+
+ if (k2)
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+ else
+ prt_printf(&buf2, "(none)");
+
mustfix_fsck_err_on(!k1 || !k2 ||
IS_ERR(k1) ||
IS_ERR(k2) ||
k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(k1)) ||
+ memcmp(k1, k2, bkey_bytes(&k1->k)) ||
l1 != l2, c,
"superblock btree root %u doesn't match journal after clean shutdown\n"
"sb: l=%u %s\n"
"journal: l=%u %s\n", i,
- l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
- l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
+ l1, buf1.buf,
+ l2, buf2.buf);
}
fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
return ret;
}
@@ -800,10 +941,10 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
GFP_KERNEL);
if (!clean) {
mutex_unlock(&c->sb_lock);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
}
- ret = bch2_sb_clean_validate(c, clean, READ);
+ ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) {
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
@@ -817,6 +958,20 @@ fsck_err:
return ERR_PTR(ret);
}
+static bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int read_btree_roots(struct bch_fs *c)
{
unsigned i;
@@ -828,14 +983,14 @@ static int read_btree_roots(struct bch_fs *c)
if (!r->alive)
continue;
- if (i == BTREE_ID_alloc &&
+ if (btree_id_is_alloc(i) &&
c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
continue;
}
if (r->error) {
- __fsck_err(c, i == BTREE_ID_alloc
+ __fsck_err(c, btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
"invalid btree root %s",
bch2_btree_ids[i]);
@@ -845,18 +1000,25 @@ static int read_btree_roots(struct bch_fs *c)
ret = bch2_btree_root_read(c, i, &r->key, r->level);
if (ret) {
- __fsck_err(c, i == BTREE_ID_alloc
+ __fsck_err(c,
+ btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
"error reading btree root %s",
bch2_btree_ids[i]);
- if (i == BTREE_ID_alloc)
+ if (btree_id_is_alloc(i))
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
}
- for (i = 0; i < BTREE_ID_NR; i++)
- if (!c->btree_roots[i].b)
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = &c->btree_roots[i];
+
+ if (!r->b) {
+ r->alive = false;
+ r->level = 0;
bch2_btree_root_alloc(c, i);
+ }
+ }
fsck_err:
return ret;
}
@@ -881,7 +1043,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
if (ret)
return ret;
-
bkey_subvolume_init(&root_volume.k_i);
root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
root_volume.v.flags = 0;
@@ -933,7 +1094,7 @@ int bch2_fs_recovery(struct bch_fs *c)
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
struct jset *last_journal_entry = NULL;
- u64 blacklist_seq, journal_seq;
+ u64 last_seq, blacklist_seq, journal_seq;
bool write_sb = false;
int ret = 0;
@@ -974,28 +1135,21 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.fix_errors = FSCK_OPT_YES;
}
- if (!c->replicas.entries ||
- c->opts.rebuild_replicas) {
- bch_info(c, "building replicas info");
- set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
- }
-
if (!c->opts.nochanges) {
- if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
- bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
+ if (c->sb.version < bcachefs_metadata_version_no_bps_in_alloc_keys) {
+ bch_info(c, "version prior to no_bps_in_alloc_keys, upgrade and fsck required");
c->opts.version_upgrade = true;
c->opts.fsck = true;
c->opts.fix_errors = FSCK_OPT_YES;
- } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) {
- bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
- c->opts.version_upgrade = true;
- c->opts.fsck = true;
- } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
- bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
- c->opts.version_upgrade = true;
}
}
+ if (c->opts.fsck && c->opts.norecovery) {
+ bch_err(c, "cannot select both norecovery and fsck");
+ ret = -EINVAL;
+ goto err;
+ }
+
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
@@ -1003,17 +1157,24 @@ int bch2_fs_recovery(struct bch_fs *c)
}
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
- struct journal_replay *i;
+ struct genradix_iter iter;
+ struct journal_replay **i;
bch_verbose(c, "starting journal read");
- ret = bch2_journal_read(c, &c->journal_entries,
- &blacklist_seq, &journal_seq);
+ ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
if (ret)
goto err;
- list_for_each_entry_reverse(i, &c->journal_entries, list)
- if (!i->ignore) {
- last_journal_entry = &i->j;
+ /*
+ * note: cmd_list_journal needs the blacklist table fully up to date so
+ * it can asterisk ignored journal entries:
+ */
+ if (c->opts.read_journal_only)
+ goto out;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i && !(*i)->ignore) {
+ last_journal_entry = &(*i)->j;
break;
}
@@ -1028,14 +1189,20 @@ int bch2_fs_recovery(struct bch_fs *c)
if (!last_journal_entry) {
fsck_err_on(!c->sb.clean, c, "no journal entries found");
- goto use_clean;
+ if (clean)
+ goto use_clean;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i) {
+ last_journal_entry = &(*i)->j;
+ (*i)->ignore = false;
+ break;
+ }
}
- c->journal_keys = journal_keys_sort(&c->journal_entries);
- if (!c->journal_keys.d) {
- ret = -ENOMEM;
+ ret = journal_keys_sort(c);
+ if (ret)
goto err;
- }
if (c->sb.clean && last_journal_entry) {
ret = verify_superblock_clean(c, &clean,
@@ -1047,7 +1214,7 @@ int bch2_fs_recovery(struct bch_fs *c)
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
- ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
+ ret = -BCH_ERR_fsck_repair_impossible;
goto err;
}
@@ -1061,7 +1228,7 @@ use_clean:
zero_out_btree_mem_ptr(&c->journal_keys);
- ret = journal_replay_early(c, clean, &c->journal_entries);
+ ret = journal_replay_early(c, clean);
if (ret)
goto err;
@@ -1076,7 +1243,9 @@ use_clean:
journal_seq += 8;
if (blacklist_seq != journal_seq) {
- ret = bch2_journal_seq_blacklist_add(c,
+ ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
+ blacklist_seq, journal_seq) ?:
+ bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
@@ -1084,18 +1253,35 @@ use_clean:
}
}
- ret = bch2_fs_journal_start(&c->journal, journal_seq,
- &c->journal_entries);
+ ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
+ journal_seq, last_seq, blacklist_seq - 1) ?:
+ bch2_fs_journal_start(&c->journal, journal_seq);
if (ret)
goto err;
+ if (c->opts.reconstruct_alloc)
+ bch2_journal_log_msg(c, "dropping alloc info");
+
+ /*
+ * Skip past versions that might have possibly been used (as nonces),
+ * but hadn't had their pointers written:
+ */
+ if (c->sb.encryption_type && !c->sb.clean)
+ atomic64_add(1 << 16, &c->key_version);
+
ret = read_btree_roots(c);
if (ret)
goto err;
bch_verbose(c, "starting alloc read");
err = "error reading allocation information";
- ret = bch2_alloc_read(c);
+
+ down_read(&c->gc_lock);
+ ret = c->sb.version < bcachefs_metadata_version_bucket_gens
+ ? bch2_alloc_read(c)
+ : bch2_bucket_gens_read(c);
+ up_read(&c->gc_lock);
+
if (ret)
goto err;
bch_verbose(c, "alloc read done");
@@ -1107,84 +1293,124 @@ use_clean:
goto err;
bch_verbose(c, "stripes_read done");
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+ err = "error creating root snapshot node";
+ ret = bch2_fs_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+ }
- /*
- * If we're not running fsck, this ensures bch2_fsck_err() calls are
- * instead interpreted as bch2_inconsistent_err() calls:
- */
- if (!c->opts.fsck)
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ bch_verbose(c, "reading snapshots table");
+ err = "error reading snapshots table";
+ ret = bch2_fs_snapshots_start(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
- if (c->opts.fsck ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
- test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
+ if (c->opts.fsck) {
bool metadata_only = c->opts.norecovery;
- bch_info(c, "starting mark and sweep");
- err = "error in mark and sweep";
+ bch_info(c, "checking allocations");
+ err = "error checking allocations";
ret = bch2_gc(c, true, metadata_only);
if (ret)
goto err;
- bch_verbose(c, "mark and sweep done");
- }
+ bch_verbose(c, "done checking allocations");
- bch2_stripes_heap_start(c);
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
- clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
- set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
- /*
- * Skip past versions that might have possibly been used (as nonces),
- * but hadn't had their pointers written:
- */
- if (c->sb.encryption_type && !c->sb.clean)
- atomic64_add(1 << 16, &c->key_version);
+ bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+ err = "journal replay failed";
+ ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1);
+ if (ret)
+ goto err;
+ if (c->opts.verbose || !c->sb.clean)
+ bch_info(c, "journal replay done");
- if (c->opts.norecovery)
- goto out;
+ bch_info(c, "checking need_discard and freespace btrees");
+ err = "error checking need_discard and freespace btrees";
+ ret = bch2_check_alloc_info(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking need_discard and freespace btrees");
- bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
- err = "journal replay failed";
- ret = bch2_journal_replay(c);
- if (ret)
- goto err;
- if (c->opts.verbose || !c->sb.clean)
- bch_info(c, "journal replay done");
+ set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags);
- if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
- !c->opts.nochanges) {
- /*
- * note that even when filesystem was clean there might be work
- * to do here, if we ran gc (because of fsck) which recalculated
- * oldest_gen:
- */
- bch_verbose(c, "writing allocation info");
- err = "error writing out alloc info";
- ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error writing alloc info");
+ bch_info(c, "checking lrus");
+ err = "error checking lrus";
+ ret = bch2_check_lrus(c);
+ if (ret)
goto err;
- }
- bch_verbose(c, "alloc write done");
- }
+ bch_verbose(c, "done checking lrus");
+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
- if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
- bch2_fs_lazy_rw(c);
+ bch_info(c, "checking backpointers to alloc keys");
+ err = "error checking backpointers to alloc keys";
+ ret = bch2_check_btree_backpointers(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking backpointers to alloc keys");
- err = "error creating root snapshot node";
- ret = bch2_fs_initialize_subvolumes(c);
+ bch_info(c, "checking backpointers to extents");
+ err = "error checking backpointers to extents";
+ ret = bch2_check_backpointers_to_extents(c);
if (ret)
goto err;
+ bch_verbose(c, "done checking backpointers to extents");
+
+ bch_info(c, "checking extents to backpointers");
+ err = "error checking extents to backpointers";
+ ret = bch2_check_extents_to_backpointers(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking extents to backpointers");
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+
+ bch_info(c, "checking alloc to lru refs");
+ err = "error checking alloc to lru refs";
+ ret = bch2_check_alloc_to_lru_refs(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "done checking alloc to lru refs");
+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+ } else {
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+ if (c->opts.norecovery)
+ goto out;
+
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+
+ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+ err = "journal replay failed";
+ ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1);
+ if (ret)
+ goto err;
+ if (c->opts.verbose || !c->sb.clean)
+ bch_info(c, "journal replay done");
}
- bch_verbose(c, "reading snapshots table");
- err = "error reading snapshots table";
- ret = bch2_fs_snapshots_start(c);
+ err = "error initializing freespace";
+ ret = bch2_fs_freespace_init(c);
if (ret)
goto err;
- bch_verbose(c, "reading snapshots done");
+
+ if (c->sb.version < bcachefs_metadata_version_bucket_gens &&
+ c->opts.version_upgrade) {
+ bch_info(c, "initializing bucket_gens");
+ err = "error initializing bucket gens";
+ ret = bch2_bucket_gens_init(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "bucket_gens init done");
+ }
if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
/* set bi_subvol on root inode */
@@ -1248,7 +1474,7 @@ use_clean:
le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
struct bch_move_stats stats;
- bch_move_stats_init(&stats, "recovery");
+ bch2_move_stats_init(&stats, "recovery");
bch_info(c, "scanning for old btree nodes");
ret = bch2_fs_read_write(c);
@@ -1270,15 +1496,22 @@ out:
set_bit(BCH_FS_FSCK_DONE, &c->flags);
bch2_flush_fsck_errs(c);
- if (!c->opts.keep_journal) {
+ if (!c->opts.keep_journal &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) {
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
}
kfree(clean);
+
+ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+ bch2_fs_read_write_early(c);
+ bch2_delete_dead_snapshots_async(c);
+ }
+
if (ret)
- bch_err(c, "Error in recovery: %s (%i)", err, ret);
+ bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret));
else
- bch_verbose(c, "ret %i", ret);
+ bch_verbose(c, "ret %s", bch2_err_str(ret));
return ret;
err:
fsck_err:
@@ -1293,7 +1526,6 @@ int bch2_fs_initialize(struct bch_fs *c)
struct qstr lostfound = QSTR("lost+found");
const char *err = "cannot allocate memory";
struct bch_dev *ca;
- LIST_HEAD(journal);
unsigned i;
int ret;
@@ -1303,6 +1535,9 @@ int bch2_fs_initialize(struct bch_fs *c)
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+ if (c->sb.version < bcachefs_metadata_version_inode_v3)
+ c->opts.version_upgrade = true;
+
if (c->opts.version_upgrade) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
@@ -1310,13 +1545,19 @@ int bch2_fs_initialize(struct bch_fs *c)
}
mutex_unlock(&c->sb_lock);
- set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
set_bit(BCH_FS_FSCK_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
+ for_each_online_member(ca, c, i)
+ bch2_dev_usage_init(ca);
+
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
@@ -1330,7 +1571,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1, &journal);
+ bch2_fs_journal_start(&c->journal, 1);
bch2_journal_set_replay_done(&c->journal);
err = "error going read-write";
@@ -1342,6 +1583,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* Write out the superblock and journal buckets, now that we can do
* btree updates
*/
+ bch_verbose(c, "marking superblocks");
err = "error marking superblock and journal";
for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca);
@@ -1353,6 +1595,12 @@ int bch2_fs_initialize(struct bch_fs *c)
ca->new_fs_bucket_idx = 0;
}
+ bch_verbose(c, "initializing freespace");
+ err = "error initializing freespace";
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ goto err;
+
err = "error creating root snapshot node";
ret = bch2_fs_initialize_subvolumes(c);
if (ret)
@@ -1365,11 +1613,10 @@ int bch2_fs_initialize(struct bch_fs *c)
goto err;
bch_verbose(c, "reading snapshots done");
- bch2_inode_init(c, &root_inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
- bch2_inode_pack(c, &packed_inode, &root_inode);
+ bch2_inode_pack(&packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
err = "error creating root directory";
@@ -1414,6 +1661,6 @@ int bch2_fs_initialize(struct bch_fs *c)
return 0;
err:
- pr_err("Error initializing new filesystem: %s (%i)", err, ret);
+ pr_err("Error initializing new filesystem: %s (%s)", err, bch2_err_str(ret));
return ret;
}
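
Every step in the recovery paths above follows the same reporting convention: set a human-readable description in err, run the step, and on failure jump to the common error label, which now prints both the description and the symbolic errcode from bch2_err_str(). A minimal sketch of one step; the description and function below are placeholders, not part of the patch.

	err = "error running some recovery step";	/* placeholder text */
	ret = some_recovery_step(c);			/* hypothetical step */
	if (ret)
		goto err;	/* reports: "<err> (<bch2_err_str(ret)>)" */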
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 21bdad9db249..8c0348e8b84c 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,9 +2,6 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-#define for_each_journal_key(keys, i) \
- for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
-
struct journal_iter {
struct list_head list;
enum btree_id btree_id;
@@ -23,16 +20,14 @@ struct btree_and_journal_iter {
struct bkey unpacked;
struct journal_iter journal;
-
- enum last_key_returned {
- none,
- btree,
- journal,
- } last;
+ struct bpos pos;
+ bool at_end;
};
-size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
- unsigned, struct bpos);
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
@@ -45,7 +40,6 @@ void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
@@ -56,7 +50,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct btree *);
void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct list_head *);
+void bch2_journal_entries_free(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
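
A hedged usage sketch for the journal-keys helpers declared above: bch2_journal_keys_peek_slot() returns whatever key is buffered from the journal for an exact position (NULL when there is none, judging by the pointer return), and the _upto variant adds an end position plus a caller-held size_t cursor, treated as opaque here. The wrapper below is hypothetical; only the prototypes are from the patch.

static struct bkey_i *peek_buffered_extent(struct bch_fs *c, struct bpos pos)
{
	/* btree id and level 0 chosen for illustration only */
	return bch2_journal_keys_peek_slot(c, BTREE_ID_extents, 0, pos);
}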
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c8d6d73681e0..d8426e754cdf 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -25,18 +25,25 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
-const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- if (bkey_val_bytes(p.k) != sizeof(*p.v))
- return "incorrect value size";
+ if (bkey_val_bytes(p.k) != sizeof(*p.v)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(p.k), sizeof(*p.v));
+ return -EINVAL;
+ }
if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
- le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad))
- return "idx < front_pad";
+ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
+ prt_printf(err, "idx < front_pad (%llu < %u)",
+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
@@ -44,7 +51,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- pr_buf(out, "idx %llu front_pad %u back_pad %u",
+ prt_printf(out, "idx %llu front_pad %u back_pad %u",
le64_to_cpu(p.v->idx),
le32_to_cpu(p.v->front_pad),
le32_to_cpu(p.v->back_pad));
@@ -70,14 +77,18 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
/* indirect extents */
-const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
- if (bkey_val_bytes(r.k) < sizeof(*r.v))
- return "incorrect value size";
+ if (bkey_val_bytes(r.k) < sizeof(*r.v)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(r.k), sizeof(*r.v));
+ return -BCH_ERR_invalid_bkey;
+ }
- return bch2_bkey_ptrs_invalid(c, k);
+ return bch2_bkey_ptrs_invalid(c, k, flags, err);
}
void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
@@ -85,7 +96,7 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
- pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
bch2_bkey_ptrs_to_text(out, c, k);
}
@@ -98,14 +109,37 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
}
+int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
+
+ if (!r->v.refcount) {
+ r->k.type = KEY_TYPE_deleted;
+ r->k.size = 0;
+ set_bkey_val_u64s(&r->k, 0);
+ return 0;
+ }
+ }
+
+ return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+}
+
/* indirect inline data */
-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c,
- struct bkey_s_c k)
+int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data))
- return "incorrect value size";
- return NULL;
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data));
+ return -BCH_ERR_invalid_bkey;
+ }
+
+ return 0;
}
void bch2_indirect_inline_data_to_text(struct printbuf *out,
@@ -114,11 +148,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
- pr_buf(out, "refcount %llu datalen %u: %*phN",
+ prt_printf(out, "refcount %llu datalen %u: %*phN",
le64_to_cpu(d.v->refcount), datalen,
min(datalen, 32U), d.v->data);
}
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
+ struct bkey_i_indirect_inline_data *r =
+ bkey_i_to_indirect_inline_data(new);
+
+ if (!r->v.refcount) {
+ r->k.type = KEY_TYPE_deleted;
+ r->k.size = 0;
+ set_bkey_val_u64s(&r->k, 0);
+ }
+ }
+
+ return 0;
+}
+
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *orig)
@@ -136,7 +189,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink,
POS(0, c->reflink_hint),
- BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_SLOTS, k, ret) {
if (reflink_iter.pos.inode) {
bch2_btree_iter_set_pos(&reflink_iter, POS_MIN);
continue;
@@ -180,7 +233,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
orig->k.type = KEY_TYPE_reflink_p;
r_p = bkey_i_to_reflink_p(orig);
set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+
+ /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
+#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
+ __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
+#else
memset(&r_p->v, 0, sizeof(r_p->v));
+#endif
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
@@ -198,15 +257,15 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
struct bkey_s_c k;
int ret;
- for_each_btree_key_continue_norestart(*iter, 0, k, ret) {
- if (bkey_cmp(iter->pos, end) >= 0)
- break;
+ for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) {
+ if (bkey_extent_is_unwritten(k))
+ continue;
if (bkey_extent_is_data(k.k))
return k;
}
- if (bkey_cmp(iter->pos, end) >= 0)
+ if (bkey_ge(iter->pos, end))
bch2_btree_iter_set_pos(iter, end);
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
}
@@ -229,8 +288,8 @@ s64 bch2_remap_range(struct bch_fs *c,
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
- if (!percpu_ref_tryget(&c->writes))
- return -EROFS;
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
+ return -BCH_ERR_erofs_no_writes;
bch2_check_set_feature(c, BCH_FEATURE_reflink);
@@ -246,8 +305,9 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
BTREE_ITER_INTENT);
- while ((ret == 0 || ret == -EINTR) &&
- bkey_cmp(dst_iter.pos, dst_end) < 0) {
+ while ((ret == 0 ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
+ bkey_lt(dst_iter.pos, dst_end)) {
struct disk_reservation disk_res = { 0 };
bch2_trans_begin(&trans);
@@ -280,7 +340,7 @@ s64 bch2_remap_range(struct bch_fs *c,
if (ret)
continue;
- if (bkey_cmp(src_want, src_iter.pos) < 0) {
+ if (bkey_lt(src_want, src_iter.pos)) {
ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
min(dst_end.offset,
dst_iter.pos.offset +
@@ -324,7 +384,7 @@ s64 bch2_remap_range(struct bch_fs *c,
dst_end.offset - dst_iter.pos.offset));
ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res, NULL,
+ new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
@@ -332,8 +392,8 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_exit(&trans, &dst_iter);
bch2_trans_iter_exit(&trans, &src_iter);
- BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end));
- BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0);
+ BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
+ BUG_ON(bkey_gt(dst_iter.pos, dst_end));
dst_done = dst_iter.pos.offset - dst_start.offset;
new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
@@ -356,13 +416,13 @@ s64 bch2_remap_range(struct bch_fs *c,
}
bch2_trans_iter_exit(&trans, &inode_iter);
- } while (ret2 == -EINTR);
+ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
- percpu_ref_put(&c->writes);
+ bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
return dst_done ?: ret ?: ret2;
}
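
Two conventions recur throughout the reflink changes above: position comparisons use the typed helpers bkey_lt()/bkey_ge()/bkey_eq() instead of open-coded bkey_cmp() tests, and transaction restarts are recognized with bch2_err_matches(ret, BCH_ERR_transaction_restart) rather than by comparing against -EINTR. A minimal retry-loop sketch under those assumptions; the loop body is illustrative only.

	while ((ret == 0 ||
		bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
	       bkey_lt(iter.pos, end)) {
		bch2_trans_begin(&trans);

		ret = do_one_unit_of_work(&trans, &iter);	/* hypothetical */
	}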
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 3745873fd88d..2391037c2ece 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -2,36 +2,49 @@
#ifndef _BCACHEFS_REFLINK_H
#define _BCACHEFS_REFLINK_H
-const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
+#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
- .key_merge = bch2_reflink_p_merge, \
-}
+ .key_merge = bch2_reflink_p_merge, \
+ .trans_trigger = bch2_trans_mark_reflink_p, \
+ .atomic_trigger = bch2_mark_reflink_p, \
+})
-const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
-#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
+#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
-}
+ .trans_trigger = bch2_trans_mark_reflink_v, \
+ .atomic_trigger = bch2_mark_extent, \
+})
-const char *bch2_indirect_inline_data_invalid(const struct bch_fs *,
- struct bkey_s_c);
+int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *,
+ unsigned);
-#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \
+#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
-}
+ .trans_trigger = bch2_trans_mark_indirect_inline_data, \
+})
static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
{
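
The bkey_ops macros above now expand to parenthesized compound literals and may carry trigger hooks next to the validate/print callbacks. A hedged sketch of the convention for a hypothetical key type; none of these names are introduced by this patch.

#define bch2_bkey_ops_example ((struct bkey_ops) {	\
	.key_invalid	= bch2_example_invalid,		\
	.val_to_text	= bch2_example_to_text,		\
	.trans_trigger	= bch2_trans_mark_example,	\
	.atomic_trigger	= bch2_mark_example,		\
})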
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index a08f1e084a9d..8935ff5899c9 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -36,20 +36,36 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
+void bch2_replicas_entry_v0_to_text(struct printbuf *out,
+ struct bch_replicas_entry_v0 *e)
+{
+ unsigned i;
+
+ if (e->data_type < BCH_DATA_NR)
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ prt_printf(out, "(invalid data type %u)", e->data_type);
+
+ prt_printf(out, ": %u [", e->nr_devs);
+ for (i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
+}
+
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e)
{
unsigned i;
if (e->data_type < BCH_DATA_NR)
- pr_buf(out, "%s", bch2_data_types[e->data_type]);
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
else
- pr_buf(out, "(invalid data type %u)", e->data_type);
+ prt_printf(out, "(invalid data type %u)", e->data_type);
- pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs);
+ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
for (i = 0; i < e->nr_devs; i++)
- pr_buf(out, i ? " %u" : "%u", e->devs[i]);
- pr_buf(out, "]");
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -60,7 +76,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
for_each_cpu_replicas_entry(r, e) {
if (!first)
- pr_buf(out, " ");
+ prt_printf(out, " ");
first = false;
bch2_replicas_entry_to_text(out, e);
@@ -320,7 +336,7 @@ out:
return ret;
err:
bch_err(c, "error updating replicas table: memory allocation failure");
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_replicas_table;
goto out;
}
@@ -367,14 +383,18 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
if (c->replicas_gc.entries &&
!__replicas_has_entry(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
- if (!new_gc.entries)
+ if (!new_gc.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
goto err;
+ }
}
if (!__replicas_has_entry(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
- if (!new_r.entries)
+ if (!new_r.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
goto err;
+ }
ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
if (ret)
@@ -409,23 +429,14 @@ out:
return ret;
err:
- bch_err(c, "error adding replicas entry: memory allocation failure");
- ret = -ENOMEM;
+ bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret));
goto out;
}
-static int __bch2_mark_replicas(struct bch_fs *c,
- struct bch_replicas_entry *r,
- bool check)
-{
- return likely(bch2_replicas_marked(c, r)) ? 0
- : check ? -1
- : bch2_mark_replicas_slowpath(c, r);
-}
-
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
{
- return __bch2_mark_replicas(c, r, false);
+ return likely(bch2_replicas_marked(c, r))
+ ? 0 : bch2_mark_replicas_slowpath(c, r);
}
/* replicas delta list: */
@@ -470,7 +481,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
n = cpu_replicas_add_entry(&c->replicas_gc, e);
if (!n.entries) {
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
goto err;
}
@@ -479,10 +490,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
}
}
- if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
- ret = -ENOSPC;
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+ if (ret)
goto err;
- }
ret = replicas_table_update(c, &c->replicas_gc);
err:
@@ -526,7 +536,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_replicas_gc;
}
for_each_cpu_replicas_entry(&c->replicas, e)
@@ -555,7 +565,7 @@ retry:
new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
if (!new.entries) {
bch_err(c, "error allocating c->replicas_gc");
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_replicas_gc;
}
mutex_lock(&c->sb_lock);
@@ -585,10 +595,9 @@ retry:
bch2_cpu_replicas_sort(&new);
- if (bch2_cpu_replicas_to_sb_replicas(c, &new)) {
- ret = -ENOSPC;
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
+ if (ret)
goto err;
- }
ret = replicas_table_update(c, &new);
err:
@@ -615,7 +624,7 @@ int bch2_replicas_set_usage(struct bch_fs *c,
n = cpu_replicas_add_entry(&c->replicas, r);
if (!n.entries)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_cpu_replicas;
ret = replicas_table_update(c, &n);
if (ret)
@@ -649,7 +658,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_cpu_replicas;
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
@@ -681,7 +690,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_cpu_replicas;
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
@@ -711,9 +720,8 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
-
if (ret)
- return -ENOMEM;
+ return ret;
bch2_cpu_replicas_sort(&new_r);
@@ -743,7 +751,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_sb_replicas;
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);
@@ -788,7 +796,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
sb_r = bch2_sb_resize_replicas(&c->disk_sb,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -ENOSPC;
+ return -BCH_ERR_ENOSPC_sb_replicas;
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
@@ -826,29 +834,29 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
cpu_replicas_entry(cpu_r, i);
if (e->data_type >= BCH_DATA_NR) {
- pr_buf(err, "invalid data type in entry ");
+ prt_printf(err, "invalid data type in entry ");
bch2_replicas_entry_to_text(err, e);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_replicas;
}
if (!e->nr_devs) {
- pr_buf(err, "no devices in entry ");
+ prt_printf(err, "no devices in entry ");
bch2_replicas_entry_to_text(err, e);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_replicas;
}
if (e->nr_required > 1 &&
e->nr_required >= e->nr_devs) {
- pr_buf(err, "bad nr_required in entry ");
+ prt_printf(err, "bad nr_required in entry ");
bch2_replicas_entry_to_text(err, e);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_replicas;
}
for (j = 0; j < e->nr_devs; j++)
if (!bch2_dev_exists(sb, mi, e->devs[j])) {
- pr_buf(err, "invalid device %u in entry ", e->devs[j]);
+ prt_printf(err, "invalid device %u in entry ", e->devs[j]);
bch2_replicas_entry_to_text(err, e);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_replicas;
}
if (i + 1 < cpu_r->nr) {
@@ -858,9 +866,9 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
if (!memcmp(e, n, cpu_r->entry_size)) {
- pr_buf(err, "duplicate replicas entry ");
+ prt_printf(err, "duplicate replicas entry ");
bch2_replicas_entry_to_text(err, e);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_replicas;
}
}
}
@@ -868,15 +876,16 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
return 0;
}
-static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f,
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_replicas_cpu cpu_r;
int ret;
- if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
- return -ENOMEM;
+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
kfree(cpu_r.entries);
@@ -893,35 +902,56 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
for_each_replicas_entry(r, e) {
if (!first)
- pr_buf(out, " ");
+ prt_printf(out, " ");
first = false;
bch2_replicas_entry_to_text(out, e);
}
+ prt_newline(out);
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
- .validate = bch2_sb_validate_replicas,
+ .validate = bch2_sb_replicas_validate,
.to_text = bch2_sb_replicas_to_text,
};
-static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
struct bch_replicas_cpu cpu_r;
int ret;
- if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
- return -ENOMEM;
+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
kfree(cpu_r.entries);
return ret;
}
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_entry_v0 *e;
+ bool first = true;
+
+ for_each_replicas_entry(sb_r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_v0_to_text(out, e);
+ }
+ prt_newline(out);
+}
+
const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
- .validate = bch2_sb_validate_replicas_v0,
+ .validate = bch2_sb_replicas_v0_validate,
+ .to_text = bch2_sb_replicas_v0_to_text,
};
/* Query replicas: */
@@ -962,11 +992,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
if (dflags & ~flags) {
if (print) {
- char buf[100];
+ struct printbuf buf = PRINTBUF;
- bch2_replicas_entry_to_text(&PBUF(buf), e);
+ bch2_replicas_entry_to_text(&buf, e);
bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf);
+ nr_online, buf.buf);
+ printbuf_exit(&buf);
}
ret = false;
break;
@@ -978,19 +1009,42 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
return ret;
}
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
- struct bch_replicas_entry *e;
- unsigned i, ret = 0;
+ struct bch_sb_field_replicas *replicas;
+ struct bch_sb_field_replicas_v0 *replicas_v0;
+ unsigned i, data_has = 0;
+
+ replicas = bch2_sb_get_replicas(sb);
+ replicas_v0 = bch2_sb_get_replicas_v0(sb);
+
+ if (replicas) {
+ struct bch_replicas_entry *r;
+
+ for_each_replicas_entry(replicas, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ } else if (replicas_v0) {
+ struct bch_replicas_entry_v0 *r;
+
+ for_each_replicas_entry_v0(replicas_v0, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ }
- percpu_down_read(&c->mark_lock);
- for_each_cpu_replicas_entry(&c->replicas, e)
- for (i = 0; i < e->nr_devs; i++)
- if (e->devs[i] == ca->dev_idx)
- ret |= 1 << e->data_type;
+ return data_has;
+}
- percpu_up_read(&c->mark_lock);
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&c->sb_lock);
return ret;
}
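
bch2_dev_has_data() above now answers from the superblock's replicas section, taken under sb_lock, and returns a bitmask of data types present on the device. A small illustrative caller, not taken from this patch; BCH_DATA_user is assumed from the BCH_DATA_* enum.

static bool dev_holds_user_data(struct bch_fs *c, struct bch_dev *ca)
{
	return bch2_dev_has_data(c, ca) & (1U << BCH_DATA_user);
}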
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index d237d7c51ccb..4887675a86f0 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_REPLICAS_H
#define _BCACHEFS_REPLICAS_H
+#include "bkey.h"
#include "eytzinger.h"
#include "replicas_types.h"
@@ -26,22 +27,6 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
-struct replicas_delta {
- s64 delta;
- struct bch_replicas_entry r;
-} __packed;
-
-struct replicas_delta_list {
- unsigned size;
- unsigned used;
-
- struct {} memset_start;
- u64 nr_inodes;
- u64 persistent_reserved[BCH_REPLICAS_MAX];
- struct {} memset_end;
- struct replicas_delta d[0];
-};
-
static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
{
@@ -64,6 +49,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
unsigned, bool);
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
index 0535b1d3760e..5cfff489bbc3 100644
--- a/fs/bcachefs/replicas_types.h
+++ b/fs/bcachefs/replicas_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REPLICAS_TYPES_H
#define _BCACHEFS_REPLICAS_TYPES_H
@@ -7,4 +8,20 @@ struct bch_replicas_cpu {
struct bch_replicas_entry *entries;
};
+struct replicas_delta {
+ s64 delta;
+ struct bch_replicas_entry r;
+} __packed;
+
+struct replicas_delta_list {
+ unsigned size;
+ unsigned used;
+
+ struct {} memset_start;
+ u64 nr_inodes;
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ struct {} memset_end;
+ struct replicas_delta d[0];
+};
+
#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
index c062edb3fbc2..dc1a27cc31cd 100644
--- a/fs/bcachefs/siphash.c
+++ b/fs/bcachefs/siphash.c
@@ -160,7 +160,7 @@ u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
memset(ctx, 0, sizeof(*ctx));
- return (r);
+ return r;
}
u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 5de733b95aa4..6178ae620ff1 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -13,7 +13,7 @@
#include <linux/crc32c.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
@@ -144,7 +144,9 @@ struct bch_hash_desc {
static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
{
return k.k->type == desc.key_type &&
- (!desc.is_visible || desc.is_visible(inum, k));
+ (!desc.is_visible ||
+ !inum.inum ||
+ desc.is_visible(inum, k));
}
static __always_inline int
@@ -163,12 +165,10 @@ bch2_hash_lookup(struct btree_trans *trans,
if (ret)
return ret;
- for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
BTREE_ITER_SLOTS|flags, k, ret) {
- if (iter->pos.inode != inum.inum)
- break;
-
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
return 0;
@@ -199,18 +199,15 @@ bch2_hash_hole(struct btree_trans *trans,
if (ret)
return ret;
- for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inum.inum)
- break;
-
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
if (!is_visible_key(desc, inum, k))
return 0;
- }
bch2_trans_iter_exit(trans, iter);
- return ret ?: -ENOSPC;
+ return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
}
static __always_inline
@@ -244,30 +241,25 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum,
- struct bkey_i *insert, int flags)
+int bch2_hash_set_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ int flags,
+ int update_flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
- u32 snapshot;
int ret;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
- for_each_btree_key_norestart(trans, iter, desc.btree_id,
- SPOS(inum.inum,
+ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
+ POS(insert->k.p.inode, U64_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter.pos.inode != inum.inum)
- break;
-
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@@ -285,7 +277,7 @@ int bch2_hash_set(struct btree_trans *trans,
}
if (!ret)
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_str_hash_create;
out:
bch2_trans_iter_exit(trans, &slot);
bch2_trans_iter_exit(trans, &iter);
@@ -311,6 +303,26 @@ not_found:
}
static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
+{
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ insert->k.p.inode = inum.inum;
+
+ return bch2_hash_set_snapshot(trans, desc, info, inum,
+ snapshot, insert, flags, 0);
+}
+
+static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
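
With the split above, bch2_hash_set() only resolves the subvolume's snapshot and fills in the key's pos.inode before delegating, so callers that already hold a snapshot ID can use bch2_hash_set_snapshot() directly. A hedged sketch; the dirent descriptor, hash_info and prepared dirent key are assumptions about the caller, not part of this patch.

	dirent->k.p.inode = inum.inum;	/* pos.inode must be set, as bch2_hash_set() does */
	ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
				     inum, snapshot, &dirent->k_i, 0, 0);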
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 69603327d93d..6407d19edc0e 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -3,21 +3,19 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "subvolume.h"
/* Snapshot tree: */
-static void bch2_delete_dead_snapshots_work(struct work_struct *);
-static void bch2_delete_dead_snapshots(struct bch_fs *);
-
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u",
BCH_SNAPSHOT_SUBVOL(s.v),
BCH_SNAPSHOT_DELETED(s.v),
le32_to_cpu(s.v->parent),
@@ -26,42 +24,59 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(s.v->subvol));
}
-const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
struct bkey_s_c_snapshot s;
u32 i, id;
- if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
- bkey_cmp(k.k->p, POS(0, 1)) < 0)
- return "bad pos";
+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1))) {
+ prt_printf(err, "bad pos");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
- return "bad val size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) {
+ prt_printf(err, "bad val size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_snapshot));
+ return -BCH_ERR_invalid_bkey;
+ }
s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
- if (id && id <= k.k->p.offset)
- return "bad parent node";
+ if (id && id <= k.k->p.offset) {
+ prt_printf(err, "bad parent node (%u <= %llu)",
+ id, k.k->p.offset);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
- return "children not normalized";
+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
+ prt_printf(err, "children not normalized");
+ return -BCH_ERR_invalid_bkey;
+ }
if (s.v->children[0] &&
- s.v->children[0] == s.v->children[1])
- return "duplicate child nodes";
+ s.v->children[0] == s.v->children[1]) {
+ prt_printf(err, "duplicate child nodes");
+ return -BCH_ERR_invalid_bkey;
+ }
for (i = 0; i < 2; i++) {
id = le32_to_cpu(s.v->children[i]);
- if (id >= k.k->p.offset)
- return "bad child node";
+ if (id >= k.k->p.offset) {
+ prt_printf(err, "bad child node (%u >= %llu)",
+ id, k.k->p.offset);
+ return -BCH_ERR_invalid_bkey;
+ }
}
- return NULL;
+ return 0;
}
int bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
@@ -72,7 +87,7 @@ int bch2_mark_snapshot(struct btree_trans *trans,
U32_MAX - new.k->p.offset,
GFP_KERNEL);
if (!t)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_mark_snapshot;
if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
@@ -118,7 +133,7 @@ static int snapshot_live(struct btree_trans *trans, u32 id)
if (!id)
return 0;
- ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ ret = snapshot_lookup(trans, id, &v);
if (ret == -ENOENT)
bch_err(trans->c, "snapshot node %u not found", id);
if (ret)
@@ -127,156 +142,207 @@ static int snapshot_live(struct btree_trans *trans, u32 id)
return !BCH_SNAPSHOT_DELETED(&v);
}
-static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
+ unsigned i, nr_live = 0, live_idx = 0;
struct bkey_s_c_snapshot snap;
- unsigned i;
- int ret;
+ u32 id = k.k->p.offset, child[2];
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- u32 id = k.k->p.offset, child[2];
- unsigned nr_live = 0, live_idx;
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
+ snap = bkey_s_c_to_snapshot(k);
- snap = bkey_s_c_to_snapshot(k);
- child[0] = le32_to_cpu(snap.v->children[0]);
- child[1] = le32_to_cpu(snap.v->children[1]);
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
- for (i = 0; i < 2; i++) {
- ret = snapshot_live(trans, child[i]);
- if (ret < 0)
- break;
+ for (i = 0; i < 2; i++) {
+ int ret = snapshot_live(trans, child[i]);
- if (ret)
- live_idx = i;
- nr_live += ret;
- }
+ if (ret < 0)
+ return ret;
- snapshot_t(c, id)->equiv = nr_live == 1
- ? snapshot_t(c, child[live_idx])->equiv
- : id;
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
}
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- bch_err(c, "error walking snapshots: %i", ret);
- return ret;
+ snapshot_t(c, id)->equiv = nr_live == 1
+ ? snapshot_t(c, child[live_idx])->equiv
+ : id;
+ return 0;
}
/* fsck: */
-static int bch2_snapshot_check(struct btree_trans *trans,
- struct bkey_s_c_snapshot s)
+static int check_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot s;
struct bch_subvolume subvol;
struct bch_snapshot v;
+ struct printbuf buf = PRINTBUF;
+ bool should_have_subvol;
u32 i, id;
- int ret;
-
- id = le32_to_cpu(s.v->subvol);
- ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
- if (ret == -ENOENT)
- bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
- s.k->p.offset, id);
- if (ret)
- return ret;
+ int ret = 0;
- if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
- bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
- s.k->p.offset);
- return -EINVAL;
- }
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+ s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
if (id) {
- ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ ret = snapshot_lookup(trans, id, &v);
if (ret == -ENOENT)
- bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
- s.k->p.offset, id);
+ bch_err(c, "snapshot with nonexistent parent:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
if (ret)
- return ret;
+ goto err;
if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
le32_to_cpu(v.children[1]) != s.k->p.offset) {
- bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+ bch_err(c, "snapshot parent %u missing pointer to child %llu",
id, s.k->p.offset);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
}
for (i = 0; i < 2 && s.v->children[i]; i++) {
id = le32_to_cpu(s.v->children[i]);
- ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ ret = snapshot_lookup(trans, id, &v);
if (ret == -ENOENT)
- bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+ bch_err(c, "snapshot node %llu has nonexistent child %u",
s.k->p.offset, id);
if (ret)
- return ret;
+ goto err;
if (le32_to_cpu(v.parent) != s.k->p.offset) {
- bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
id, le32_to_cpu(v.parent), s.k->p.offset);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
}
- return 0;
+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
+ !BCH_SNAPSHOT_DELETED(s.v);
+
+ if (should_have_subvol) {
+ id = le32_to_cpu(s.v->subvol);
+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+ if (ret == -ENOENT)
+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
+ if (ret)
+ goto err;
+
+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ s.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ } else {
+ if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&u->k_i, s.s_c);
+ u->v.subvol = 0;
+ ret = bch2_trans_update(trans, iter, &u->k_i, 0);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (BCH_SNAPSHOT_DELETED(s.v))
+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
}
-int bch2_fs_snapshots_check(struct bch_fs *c)
+int bch2_fs_check_snapshots(struct bch_fs *c)
{
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bch_snapshot s;
- unsigned id;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_snapshots, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_snapshot(&trans, &iter, k));
+
+ if (ret)
+ bch_err(c, "error %i checking snapshots", ret);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int check_subvol(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume subvol;
+ struct bch_snapshot snapshot;
+ unsigned snapid;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+ ret = snapshot_lookup(trans, snapid, &snapshot);
+
+ if (ret == -ENOENT)
+ bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, snapid);
+ if (ret)
+ return ret;
- ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "error deleting subvolume %llu: %s",
+ iter->pos.offset, bch2_err_str(ret));
if (ret)
- break;
+ return ret;
}
- bch2_trans_iter_exit(&trans, &iter);
- if (ret) {
- bch_err(c, "error %i checking snapshots", ret);
- goto err;
- }
+ return 0;
+}
+
+int bch2_fs_check_subvols(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ ret = for_each_btree_key_commit(&trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_subvol(&trans, &iter, k));
- for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
- POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-again_2:
- id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
- ret = snapshot_lookup(&trans, id, &s);
-
- if (ret == -EINTR) {
- k = bch2_btree_iter_peek(&iter);
- goto again_2;
- } else if (ret == -ENOENT)
- bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
- k.k->p.offset, id);
- else if (ret)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
-err:
bch2_trans_exit(&trans);
+
return ret;
}
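
The two fsck passes above share one shape: a per-key check function plus for_each_btree_key_commit(), which evaluates the expression for every key and commits each invocation as its own transaction (here with BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL). A minimal sketch with a hypothetical callback and pass, mirroring bch2_fs_check_snapshots() above.

static int check_one_key(struct btree_trans *trans, struct btree_iter *iter,
			 struct bkey_s_c k)
{
	/* inspect k; queue any repair via bch2_trans_update(trans, iter, ...) */
	return 0;
}

static int run_check_pass(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	ret = for_each_btree_key_commit(&trans, iter,
			BTREE_ID_snapshots, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
			check_one_key(&trans, &iter, k));
	bch2_trans_exit(&trans);
	return ret;
}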
@@ -290,49 +356,19 @@ int bch2_fs_snapshots_start(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
- bool have_deleted = false;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
- break;
-
- if (k.k->type != KEY_TYPE_snapshot) {
- bch_err(c, "found wrong key type %u in snapshot node table",
- k.k->type);
- continue;
- }
-
- if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
- have_deleted = true;
-
- ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- if (ret)
- goto err;
+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+ bch2_snapshot_set_equiv(&trans, k));
- ret = bch2_snapshots_set_equiv(&trans);
- if (ret)
- goto err;
-err:
bch2_trans_exit(&trans);
- if (!ret && have_deleted) {
- bch_info(c, "restarting deletion of dead snapshots");
- if (c->opts.fsck) {
- bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
- } else {
- bch2_delete_dead_snapshots(c);
- }
- }
-
+ if (ret)
+ bch_err(c, "error starting snapshots: %s", bch2_err_str(ret));
return ret;
}
@@ -342,35 +378,26 @@ err:
static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
struct btree_iter iter;
- struct bkey_s_c k;
struct bkey_i_snapshot *s;
int ret = 0;
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_snapshot) {
- bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
- ret = -ENOENT;
+ s = bch2_bkey_get_mut_typed(trans, &iter, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", id);
goto err;
}
/* already deleted? */
- if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
- goto err;
-
- s = bch2_trans_kmalloc(trans, sizeof(*s));
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
+ if (BCH_SNAPSHOT_DELETED(&s->v))
goto err;
- bkey_reassemble(&s->k_i, k);
-
SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+ s->v.subvol = 0;
+
ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
if (ret)
goto err;
@@ -384,7 +411,6 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
struct bkey_s_c k;
struct bkey_s_c_snapshot s;
- struct bkey_i_snapshot *parent;
u32 parent_id;
unsigned i;
int ret = 0;
@@ -408,26 +434,17 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
parent_id = le32_to_cpu(s.v->parent);
if (parent_id) {
+ struct bkey_i_snapshot *parent;
+
bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
POS(0, parent_id),
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&p_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_snapshot) {
- bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
- ret = -ENOENT;
- goto err;
- }
-
- parent = bch2_trans_kmalloc(trans, sizeof(*parent));
+ parent = bch2_bkey_get_mut_typed(trans, &p_iter, snapshot);
ret = PTR_ERR_OR_ZERO(parent);
- if (ret)
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", parent_id);
goto err;
-
- bkey_reassemble(&parent->k_i, k);
+ }
for (i = 0; i < 2; i++)
if (le32_to_cpu(parent->v.children[i]) == id)
@@ -481,17 +498,15 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
goto err;
if (!k.k || !k.k->p.offset) {
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_snapshot_create;
goto err;
}
- n = bch2_trans_kmalloc(trans, sizeof(*n));
+ n = bch2_bkey_alloc(trans, &iter, snapshot);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
- bkey_snapshot_init(&n->k_i);
- n->k.p = iter.pos;
n->v.flags = 0;
n->v.parent = cpu_to_le32(parent);
n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
@@ -499,7 +514,8 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
- bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
if (ret)
goto err;
@@ -508,23 +524,13 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
if (parent) {
bch2_btree_iter_set_pos(&iter, POS(0, parent));
- k = bch2_btree_iter_peek(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_snapshot) {
- bch_err(trans->c, "snapshot %u not found", parent);
- ret = -ENOENT;
- goto err;
- }
-
- n = bch2_trans_kmalloc(trans, sizeof(*n));
+ n = bch2_bkey_get_mut_typed(trans, &iter, snapshot);
ret = PTR_ERR_OR_ZERO(n);
- if (ret)
+ if (unlikely(ret)) {
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot %u not found", parent);
goto err;
-
- bkey_reassemble(&n->k_i, k);
+ }
if (n->v.children[0] || n->v.children[1]) {
bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
@@ -534,8 +540,9 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
n->v.children[0] = cpu_to_le32(new_snapids[0]);
n->v.children[1] = cpu_to_le32(new_snapids[1]);
+ n->v.subvol = 0;
SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
- ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+ ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
if (ret)
goto err;
}
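
Several hunks above replace the peek-slot/bch2_trans_kmalloc/bkey_reassemble sequence with bch2_bkey_get_mut_typed(), which hands back a mutable, typed copy of the key at the iterator position, or an ERR_PTR (-ENOENT when the snapshot key is missing, as the inconsistency checks above assume). A condensed sketch of the idiom as used in these functions.

	s = bch2_bkey_get_mut_typed(trans, &iter, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	ret = bch2_trans_update(trans, &iter, &s->k_i, 0);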
@@ -544,141 +551,100 @@ err:
return ret;
}
-static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+static int snapshot_delete_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ snapshot_id_list *deleted,
+ snapshot_id_list *equiv_seen,
+ struct bpos *last_pos)
{
- BUG_ON(snapshot_list_has_id(s, id));
-
- if (s->nr == s->size) {
- size_t new_size = max(8U, s->size * 2);
- void *n = krealloc(s->d,
- new_size * sizeof(s->d[0]),
- GFP_KERNEL);
- if (!n) {
- pr_err("error allocating snapshot ID list");
- return -ENOMEM;
- }
+ struct bch_fs *c = trans->c;
+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
- s->d = n;
- s->size = new_size;
- };
+ if (!bkey_eq(k.k->p, *last_pos))
+ equiv_seen->nr = 0;
+ *last_pos = k.k->p;
- s->d[s->nr++] = id;
- return 0;
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+ snapshot_list_has_id(equiv_seen, equiv)) {
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ } else {
+ return snapshot_list_add(c, equiv_seen, equiv);
+ }
}
-static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
- struct snapshot_id_list *deleted,
- enum btree_id btree_id)
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct snapshot_id_list equiv_seen = { 0 };
- struct bpos last_pos = POS_MIN;
- int ret = 0;
+ struct bkey_s_c_snapshot snap;
+ u32 children[2];
+ int ret;
- /*
- * XXX: We should also delete whiteouts that no longer overwrite
- * anything
- */
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
- bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
-
- while ((bch2_trans_begin(trans),
- (k = bch2_btree_iter_peek(&iter)).k) &&
- !(ret = bkey_err(k))) {
- u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
-
- if (bkey_cmp(k.k->p, last_pos))
- equiv_seen.nr = 0;
- last_pos = k.k->p;
-
- if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
- snapshot_list_has_id(&equiv_seen, equiv)) {
- if (btree_id == BTREE_ID_inodes &&
- bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
- continue;
-
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(trans, &iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
- if (ret)
- break;
- } else {
- ret = snapshot_id_add(&equiv_seen, equiv);
- if (ret)
- break;
- }
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ return 0;
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
- kfree(equiv_seen.d);
+ ret = snapshot_live(trans, children[0]) ?:
+ snapshot_live(trans, children[1]);
+ if (ret < 0)
+ return ret;
- return ret;
+ if (!ret)
+ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+ return 0;
}
-static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+int bch2_delete_dead_snapshots(struct bch_fs *c)
{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_snapshot snap;
- struct snapshot_id_list deleted = { 0 };
- u32 i, id, children[2];
+ snapshot_id_list deleted = { 0 };
+ u32 i, id;
int ret = 0;
+ if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
+ return 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+ ret = bch2_fs_read_write_early(c);
+ if (ret) {
+ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
+ return ret;
+ }
+ }
+
bch2_trans_init(&trans, c, 0, 0);
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
- for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
-
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
- BCH_SNAPSHOT_SUBVOL(snap.v))
- continue;
-
- children[0] = le32_to_cpu(snap.v->children[0]);
- children[1] = le32_to_cpu(snap.v->children[1]);
-
- ret = snapshot_live(&trans, children[0]) ?:
- snapshot_live(&trans, children[1]);
- if (ret < 0)
- break;
- if (ret)
- continue;
-
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
- if (ret) {
- bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
-
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ NULL, NULL, 0,
+ bch2_delete_redundant_snapshot(&trans, &iter, k));
if (ret) {
- bch_err(c, "error walking snapshots: %i", ret);
+ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
goto err;
}
- ret = bch2_snapshots_set_equiv(&trans);
- if (ret)
+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_snapshot_set_equiv(&trans, k));
+ if (ret) {
+ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
goto err;
+ }
for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k, ret) {
@@ -687,7 +653,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
snap = bkey_s_c_to_snapshot(k);
if (BCH_SNAPSHOT_DELETED(snap.v)) {
- ret = snapshot_id_add(&deleted, k.k->p.offset);
+ ret = snapshot_list_add(c, &deleted, k.k->p.offset);
if (ret)
break;
}
@@ -695,66 +661,95 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
bch2_trans_iter_exit(&trans, &iter);
if (ret) {
- bch_err(c, "error walking snapshots: %i", ret);
+ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
goto err;
}
for (id = 0; id < BTREE_ID_NR; id++) {
+ struct bpos last_pos = POS_MIN;
+ snapshot_id_list equiv_seen = { 0 };
+
if (!btree_type_has_snapshots(id))
continue;
- ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+ ret = for_each_btree_key_commit(&trans, iter,
+ id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
+
+ darray_exit(&equiv_seen);
+
if (ret) {
- bch_err(c, "error deleting snapshot keys: %i", ret);
+ bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
goto err;
}
}
for (i = 0; i < deleted.nr; i++) {
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(&trans, deleted.d[i]));
+ ret = commit_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(&trans, deleted.data[i]));
if (ret) {
- bch_err(c, "error deleting snapshot %u: %i",
- deleted.d[i], ret);
+ bch_err(c, "error deleting snapshot %u: %s",
+ deleted.data[i], bch2_err_str(ret));
goto err;
}
}
+
+ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
err:
- kfree(deleted.d);
+ darray_exit(&deleted);
bch2_trans_exit(&trans);
- percpu_ref_put(&c->writes);
+ return ret;
}
-static void bch2_delete_dead_snapshots(struct bch_fs *c)
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
- if (unlikely(!percpu_ref_tryget(&c->writes)))
- return;
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
- if (!queue_work(system_long_wq, &c->snapshot_delete_work))
- percpu_ref_put(&c->writes);
+ bch2_delete_dead_snapshots(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
+ !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
struct btree_trans_commit_hook *h)
{
- bch2_delete_dead_snapshots(trans->c);
+ struct bch_fs *c = trans->c;
+
+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+
+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ return 0;
+
+ bch2_delete_dead_snapshots_async(c);
return 0;
}
/* Subvolumes: */
-const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
- if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
- return "invalid pos";
-
- if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
- return "invalid pos";
+ if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
+ bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
+ prt_printf(err, "invalid pos");
+ return -BCH_ERR_invalid_bkey;
+ }
- if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
- return "bad val size";
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) {
+ prt_printf(err, "incorrect value size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_subvolume));
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
@@ -762,15 +757,16 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- pr_buf(out, "root %llu snapshot id %u",
+ prt_printf(out, "root %llu snapshot id %u",
le64_to_cpu(s.v->inode),
le32_to_cpu(s.v->snapshot));
}
-int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
- bool inconsistent_if_not_found,
- int iter_flags,
- struct bch_subvolume *s)
+static __always_inline int
+bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
{
struct btree_iter iter;
struct bkey_s_c k;
@@ -790,6 +786,14 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
return ret;
}
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
+}
+
int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
struct bch_subvolume *subvol)
{
@@ -805,12 +809,12 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
struct bch_subvolume s;
int ret;
- ret = bch2_subvolume_get(trans, subvol, true,
- BTREE_ITER_CACHED|
- BTREE_ITER_WITH_UPDATES,
- &s);
-
- *snapid = le32_to_cpu(s.snapshot);
+ ret = bch2_subvolume_get_inlined(trans, subvol, true,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_WITH_UPDATES,
+ &s);
+ if (!ret)
+ *snapid = le32_to_cpu(s.snapshot);
return ret;
}
@@ -824,7 +828,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
struct bkey_s_c k;
struct bkey_s_c_subvolume subvol;
struct btree_trans_commit_hook *h;
- struct bkey_i *delete;
u32 snapid;
int ret = 0;
@@ -846,19 +849,14 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
subvol = bkey_s_c_to_subvolume(k);
snapid = le32_to_cpu(subvol.v->snapshot);
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- ret = PTR_ERR_OR_ZERO(delete);
+ ret = bch2_btree_delete_at(trans, &iter, 0);
if (ret)
goto err;
- bkey_init(&delete->k);
- delete->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, delete, 0);
+ ret = bch2_snapshot_node_set_deleted(trans, snapid);
if (ret)
goto err;
- ret = bch2_snapshot_node_set_deleted(trans, snapid);
-
h = bch2_trans_kmalloc(trans, sizeof(*h));
ret = PTR_ERR_OR_ZERO(h);
if (ret)
@@ -875,14 +873,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs,
snapshot_wait_for_pagecache_and_delete_work);
- struct snapshot_id_list s;
+ snapshot_id_list s;
u32 *id;
int ret = 0;
while (!ret) {
mutex_lock(&c->snapshots_unlinked_lock);
s = c->snapshots_unlinked;
- memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+ darray_init(&c->snapshots_unlinked);
mutex_unlock(&c->snapshots_unlinked_lock);
if (!s.nr)
@@ -890,19 +888,19 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
bch2_evict_subvolume_inodes(c, &s);
- for (id = s.d; id < s.d + s.nr; id++) {
+ for (id = s.data; id < s.data + s.nr; id++) {
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_subvolume_delete(&trans, *id));
if (ret) {
- bch_err(c, "error %i deleting subvolume %u", ret, *id);
+ bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
break;
}
}
- kfree(s.d);
+ darray_exit(&s);
}
- percpu_ref_put(&c->writes);
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
}
struct subvolume_unlink_hook {
@@ -919,24 +917,23 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
mutex_lock(&c->snapshots_unlinked_lock);
if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
- ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
mutex_unlock(&c->snapshots_unlinked_lock);
if (ret)
return ret;
- if (unlikely(!percpu_ref_tryget(&c->writes)))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
return -EROFS;
- if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
- percpu_ref_put(&c->writes);
+ if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
return 0;
}
int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
{
struct btree_iter iter;
- struct bkey_s_c k;
struct bkey_i_subvolume *n;
struct subvolume_unlink_hook *h;
int ret = 0;
@@ -945,23 +942,13 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
POS(0, subvolid),
BTREE_ITER_CACHED|
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_subvolume) {
- bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
- ret = -EIO;
- goto err;
- }
-
- n = bch2_trans_kmalloc(trans, sizeof(*n));
+ n = bch2_bkey_get_mut_typed(trans, &iter, subvolume);
ret = PTR_ERR_OR_ZERO(n);
- if (ret)
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing subvolume %u", subvolid);
goto err;
+ }
- bkey_reassemble(&n->k_i, k);
SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
@@ -997,7 +984,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+ if (bkey_gt(k.k->p, SUBVOL_POS_MAX))
break;
/*
@@ -1010,7 +997,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
}
if (!ret)
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_subvolume_create;
goto err;
found_slot:
snapshot_subvols[0] = dst_iter.pos.offset;
@@ -1018,27 +1005,19 @@ found_slot:
if (src_subvolid) {
/* Creating a snapshot: */
- src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
- ret = PTR_ERR_OR_ZERO(src_subvol);
- if (ret)
- goto err;
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
POS(0, src_subvolid),
BTREE_ITER_CACHED|
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(&src_iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_subvolume) {
- bch_err(c, "subvolume %u not found", src_subvolid);
- ret = -ENOENT;
+ src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, subvolume);
+ ret = PTR_ERR_OR_ZERO(src_subvol);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(ret == -ENOENT, trans->c,
+ "subvolume %u not found", src_subvolid);
goto err;
}
- bkey_reassemble(&src_subvol->k_i, k);
parent = le32_to_cpu(src_subvol->v.snapshot);
}
@@ -1055,18 +1034,16 @@ found_slot:
goto err;
}
- new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ new_subvol = bch2_bkey_alloc(trans, &dst_iter, subvolume);
ret = PTR_ERR_OR_ZERO(new_subvol);
if (ret)
goto err;
- bkey_subvolume_init(&new_subvol->k_i);
new_subvol->v.flags = 0;
new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
new_subvol->v.inode = cpu_to_le64(inode);
SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
- new_subvol->k.p = dst_iter.pos;
ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
if (ret)
goto err;
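
A recurring change in the subvolume.c hunks above is the new calling convention for the ->key_invalid hooks: bch2_subvolume_invalid() now returns an error code and writes the human-readable reason into a caller-supplied printbuf instead of returning a const char *. As a rough standalone illustration of that convention (not bcachefs code: the printbuf here is a toy reimplementation, and value_invalid()/MY_ERR_invalid_value are invented names):

#include <stdarg.h>
#include <stdio.h>

/* minimal stand-in for the kernel's struct printbuf */
struct printbuf {
        char    buf[256];
        size_t  pos;
};

static void prt_printf(struct printbuf *out, const char *fmt, ...)
{
        size_t remaining = sizeof(out->buf) - out->pos;
        va_list args;
        int n;

        if (remaining <= 1)
                return;

        va_start(args, fmt);
        n = vsnprintf(out->buf + out->pos, remaining, fmt, args);
        va_end(args);

        if (n > 0)
                out->pos += (size_t) n < remaining ? (size_t) n : remaining - 1;
}

#define MY_ERR_invalid_value    1       /* stand-in for a -BCH_ERR_* constant */

/* new-style hook: return an error code, explain the failure in *err */
static int value_invalid(unsigned v, struct printbuf *err)
{
        if (v > 100) {
                prt_printf(err, "value too big (%u > %u)", v, 100);
                return -MY_ERR_invalid_value;
        }
        return 0;
}

int main(void)
{
        struct printbuf err = { .pos = 0 };

        if (value_invalid(250, &err))
                fprintf(stderr, "validation failed: %s\n", err.buf);
        return 0;
}
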
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index 4abe53df2788..df6657952e2f 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -2,18 +2,20 @@
#ifndef _BCACHEFS_SUBVOLUME_H
#define _BCACHEFS_SUBVOLUME_H
+#include "darray.h"
#include "subvolume_types.h"
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
-#define bch2_bkey_ops_snapshot (struct bkey_ops) { \
+#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
.val_to_text = bch2_snapshot_to_text, \
-}
-
-int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c,
- struct bkey_s_c, unsigned);
+ .atomic_trigger = bch2_mark_snapshot, \
+})
static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
{
@@ -25,6 +27,16 @@ static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
return snapshot_t(c, id)->parent;
}
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->equiv;
+}
+
+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+{
+ return id == snapshot_t(c, id)->equiv;
+}
+
static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
{
struct snapshot_t *s = snapshot_t(c, id);
@@ -56,65 +68,58 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
return id == ancestor;
}
-struct snapshots_seen {
- struct bpos pos;
- size_t nr;
- size_t size;
- u32 *d;
-};
-
-static inline void snapshots_seen_exit(struct snapshots_seen *s)
+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{
- kfree(s->d);
- s->d = NULL;
-}
+ struct snapshot_t *t = snapshot_t(c, id);
-static inline void snapshots_seen_init(struct snapshots_seen *s)
-{
- memset(s, 0, sizeof(*s));
+ return (t->children[0]|t->children[1]) != 0;
}
-static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- if (s->nr == s->size) {
- size_t new_size = max(s->size, (size_t) 128) * 2;
- u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
-
- if (!d) {
- bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
- new_size);
- return -ENOMEM;
- }
+ u32 *i;
- s->size = new_size;
- s->d = d;
- }
-
- s->d[s->nr++] = id;
- return 0;
+ darray_for_each(*s, i)
+ if (*i == id)
+ return true;
+ return false;
}
-static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- unsigned i;
+ u32 *i;
- for (i = 0; i < s->nr; i++)
- if (id == s->d[i])
+ darray_for_each(*s, i)
+ if (bch2_snapshot_is_ancestor(c, id, *i))
return true;
return false;
}
-int bch2_fs_snapshots_check(struct bch_fs *);
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret;
+
+ BUG_ON(snapshot_list_has_id(s, id));
+ ret = darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+int bch2_fs_check_snapshots(struct bch_fs *);
+int bch2_fs_check_subvols(struct bch_fs *);
+
void bch2_fs_snapshots_exit(struct bch_fs *);
int bch2_fs_snapshots_start(struct bch_fs *);
-const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
+ unsigned, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_ops_subvolume (struct bkey_ops) { \
+#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
.val_to_text = bch2_subvolume_to_text, \
-}
+})
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
@@ -126,6 +131,9 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_snapshot_node_create(struct btree_trans *, u32,
u32 *, u32 *, unsigned);
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+
int bch2_subvolume_delete(struct btree_trans *, u32);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32,
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 9410b9587591..aa49c45a35ab 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -2,10 +2,20 @@
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
-struct snapshot_id_list {
- u32 nr;
- u32 size;
- u32 *d;
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
+
+struct snapshot_t {
+ u32 parent;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 equiv;
};
+typedef struct {
+ u32 subvol;
+ u64 inum;
+} subvol_inum;
+
#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
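
The subvolume_types.h hunk above replaces the open-coded struct snapshot_id_list with typedef DARRAY(u32) snapshot_id_list, and the subvolume.h hunks rewrite snapshot_list_has_id()/snapshot_list_add() on top of darray_for_each()/darray_push(). A standalone sketch of that shape, with the darray helpers reimplemented minimally for illustration (this is not the bcachefs darray.h API), might look like:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef unsigned int u32;

/* simplified stand-in for DARRAY(u32) / snapshot_id_list */
typedef struct {
        u32     *data;
        size_t  nr;
        size_t  size;
} snapshot_id_list;

#define darray_for_each(d, i) \
        for ((i) = (d).data; (i) < (d).data + (d).nr; (i)++)

static int darray_push(snapshot_id_list *s, u32 id)
{
        if (s->nr == s->size) {
                size_t new_size = s->size ? s->size * 2 : 8;
                u32 *n = realloc(s->data, new_size * sizeof(*n));

                if (!n)
                        return -1;      /* the kernel version returns -ENOMEM */
                s->data = n;
                s->size = new_size;
        }
        s->data[s->nr++] = id;
        return 0;
}

static bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
        u32 *i;

        darray_for_each(*s, i)
                if (*i == id)
                        return true;
        return false;
}

static void darray_exit(snapshot_id_list *s)
{
        free(s->data);
        s->data = NULL;
        s->nr = s->size = 0;
}

int main(void)
{
        snapshot_id_list deleted = { 0 };

        darray_push(&deleted, 42);
        printf("has 42: %d, has 7: %d\n",
               snapshot_list_has_id(&deleted, 42),
               snapshot_list_has_id(&deleted, 7));
        darray_exit(&deleted);
        return 0;
}
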
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 6e2e077f5f8d..519df09917e3 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -10,16 +10,20 @@
#include "io.h"
#include "journal.h"
#include "journal_io.h"
+#include "journal_sb.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "quota.h"
#include "super-io.h"
#include "super.h"
#include "vstructs.h"
+#include "counters.h"
#include <linux/backing-dev.h>
#include <linux/sort.h>
+#include <trace/events/bcachefs.h>
+
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -95,8 +99,7 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
- if (sb->bio)
- bio_put(sb->bio);
+ kfree(sb->bio);
if (!IS_ERR_OR_NULL(sb->bdev))
blkdev_put(sb->bdev, sb->mode);
@@ -123,11 +126,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
- char buf[BDEVNAME_SIZE];
-
- pr_err("%s: superblock too big: want %zu but have %llu",
- bdevname(sb->bdev, buf), new_bytes, max_bytes);
- return -ENOSPC;
+ pr_err("%pg: superblock too big: want %zu but have %llu",
+ sb->bdev, new_bytes, max_bytes);
+ return -BCH_ERR_ENOSPC_sb;
}
}
@@ -135,22 +136,24 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
return 0;
if (dynamic_fault("bcachefs:add:super_realloc"))
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_sb_realloc_injected;
if (sb->have_bio) {
- bio = bio_kmalloc(GFP_KERNEL,
- DIV_ROUND_UP(new_buffer_size, PAGE_SIZE));
+ unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_sb_bio_realloc;
- if (sb->bio)
- bio_put(sb->bio);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+ kfree(sb->bio);
sb->bio = bio;
}
new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
if (!new_sb)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_sb_buf_realloc;
sb->sb = new_sb;
sb->buffer_size = new_buffer_size;
@@ -207,25 +210,26 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
u64 offset, prev_offset, max_sectors;
unsigned i;
- if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
- pr_buf(out, "Not a bcachefs superblock layout");
- return -EINVAL;
+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC) &&
+ uuid_le_cmp(layout->magic, BCHFS_MAGIC)) {
+ prt_printf(out, "Not a bcachefs superblock layout");
+ return -BCH_ERR_invalid_sb_layout;
}
if (layout->layout_type != 0) {
- pr_buf(out, "Invalid superblock layout type %u",
+ prt_printf(out, "Invalid superblock layout type %u",
layout->layout_type);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_layout_type;
}
if (!layout->nr_superblocks) {
- pr_buf(out, "Invalid superblock layout: no superblocks");
- return -EINVAL;
+ prt_printf(out, "Invalid superblock layout: no superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
}
if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
- pr_buf(out, "Invalid superblock layout: too many superblocks");
- return -EINVAL;
+ prt_printf(out, "Invalid superblock layout: too many superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
}
max_sectors = 1 << layout->sb_max_size_bits;
@@ -236,10 +240,10 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
offset = le64_to_cpu(layout->sb_offset[i]);
if (offset < prev_offset + max_sectors) {
- pr_buf(out, "Invalid superblock layout: superblocks overlap\n"
+ prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
" (sb %u ends at %llu next starts at %llu",
i - 1, prev_offset + max_sectors, offset);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
}
prev_offset = offset;
}
@@ -247,80 +251,109 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
+ int rw)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
struct bch_sb_field_members *mi;
+ enum bch_opt_id opt_id;
u32 version, version_min;
u16 block_size;
int ret;
version = le16_to_cpu(sb->version);
- version_min = version >= bcachefs_metadata_version_new_versioning
+ version_min = version >= bcachefs_metadata_version_bkey_renumber
? le16_to_cpu(sb->version_min)
: version;
if (version >= bcachefs_metadata_version_max) {
- pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)",
version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_version;
}
if (version_min < bcachefs_metadata_version_min) {
- pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(out, "Unsupported superblock version %u (min %u, max %u)",
version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_version;
}
if (version_min > version) {
- pr_buf(out, "Bad minimum version %u, greater than version field %u",
+ prt_printf(out, "Bad minimum version %u, greater than version field %u",
version_min, version);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_version;
}
if (sb->features[1] ||
(le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
- pr_buf(out, "Filesystem has incompatible features");
- return -EINVAL;
+ prt_printf(out, "Filesystem has incompatible features");
+ return -BCH_ERR_invalid_sb_features;
}
block_size = le16_to_cpu(sb->block_size);
if (block_size > PAGE_SECTORS) {
- pr_buf(out, "Block size too big (got %u, max %lu)",
+ prt_printf(out, "Block size too big (got %u, max %u)",
block_size, PAGE_SECTORS);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_block_size;
}
if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) {
- pr_buf(out, "Bad user UUID (got zeroes)");
- return -EINVAL;
+ prt_printf(out, "Bad user UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
}
if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) {
- pr_buf(out, "Bad intenal UUID (got zeroes)");
- return -EINVAL;
+ prt_printf(out, "Bad intenal UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
}
if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
- pr_buf(out, "Bad number of member devices %u (max %u)",
+ prt_printf(out, "Bad number of member devices %u (max %u)",
sb->nr_devices, BCH_SB_MEMBERS_MAX);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_too_many_members;
}
if (sb->dev_idx >= sb->nr_devices) {
- pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)",
+ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
sb->dev_idx, sb->nr_devices);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_dev_idx;
}
if (!sb->time_precision ||
le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
- pr_buf(out, "Invalid time precision: %u (min 1, max %lu)",
+ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_time_precision;
+ }
+
+ if (rw == READ) {
+ /*
+ * Been seeing a bug where these are getting inexplicably
+ * zeroed, so we're now validating them, but we have to be
+ * careful not to prevent people's filesystems from mounting:
+ */
+ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+ }
+
+ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+ const struct bch_option *opt = bch2_opt_table + opt_id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, opt_id);
+
+ prt_printf(out, "Invalid option ");
+ ret = bch2_opt_validate(opt, v, out);
+ if (ret)
+ return ret;
+
+ printbuf_reset(out);
+ }
}
/* validate layout */
@@ -330,23 +363,23 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
vstruct_for_each(sb, f) {
if (!f->u64s) {
- pr_buf(out, "Invalid superblock: optional with size 0 (type %u)",
+ prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
le32_to_cpu(f->type));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_field_size;
}
if (vstruct_next(f) > vstruct_last(sb)) {
- pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
le32_to_cpu(f->type));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_field_size;
}
}
/* members must be validated first: */
mi = bch2_sb_get_members(sb);
if (!mi) {
- pr_buf(out, "Invalid superblock: member info area missing");
- return -EINVAL;
+ prt_printf(out, "Invalid superblock: member info area missing");
+ return -BCH_ERR_invalid_sb_members_missing;
}
ret = bch2_sb_field_validate(sb, &mi->field, out);
@@ -399,7 +432,7 @@ static void bch2_sb_update(struct bch_fs *c)
ca->mi = bch2_mi_to_cpu(mi->members + i);
}
-static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
+static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
{
struct bch_sb_field *src_f, *dst_f;
struct bch_sb *dst = dst_handle->sb;
@@ -424,45 +457,45 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
memcpy(dst->compat, src->compat, sizeof(dst->compat));
for (i = 0; i < BCH_SB_FIELD_NR; i++) {
- if (i == BCH_SB_FIELD_journal)
+ int d;
+
+ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
continue;
src_f = bch2_sb_field_get(src, i);
dst_f = bch2_sb_field_get(dst, i);
+
+ d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
+ (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
+ if (d > 0) {
+ int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d);
+ if (ret)
+ return ret;
+
+ dst = dst_handle->sb;
+ dst_f = bch2_sb_field_get(dst, i);
+ }
+
dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
src_f ? le32_to_cpu(src_f->u64s) : 0);
if (src_f)
memcpy(dst_f, src_f, vstruct_bytes(src_f));
}
+
+ return 0;
}
int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
{
- struct bch_sb_field_journal *journal_buckets =
- bch2_sb_get_journal(src);
- unsigned journal_u64s = journal_buckets
- ? le32_to_cpu(journal_buckets->field.u64s)
- : 0;
int ret;
lockdep_assert_held(&c->sb_lock);
- ret = bch2_sb_realloc(&c->disk_sb,
- le32_to_cpu(src->u64s) - journal_u64s);
- if (ret)
- return ret;
-
- __copy_super(&c->disk_sb, src);
-
- if (BCH_SB_INITIALIZED(c->disk_sb.sb))
- set_bit(BCH_FS_INITIALIZED, &c->flags);
-
- ret = bch2_sb_replicas_to_cpu_replicas(c);
- if (ret)
- return ret;
-
- ret = bch2_sb_disk_groups_to_cpu(c);
+ ret = bch2_sb_realloc(&c->disk_sb, 0) ?:
+ __copy_super(&c->disk_sb, src) ?:
+ bch2_sb_replicas_to_cpu_replicas(c) ?:
+ bch2_sb_disk_groups_to_cpu(c);
if (ret)
return ret;
@@ -472,21 +505,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
- struct bch_sb_field_journal *journal_buckets =
- bch2_sb_get_journal(dst);
- unsigned journal_u64s = journal_buckets
- ? le32_to_cpu(journal_buckets->field.u64s)
- : 0;
- unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
- int ret;
-
- ret = bch2_sb_realloc(&ca->disk_sb, u64s);
- if (ret)
- return ret;
-
- __copy_super(&ca->disk_sb, src);
- return 0;
+ return __copy_super(&ca->disk_sb, c->disk_sb.sb);
}
/* read superblock: */
@@ -498,57 +517,57 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf
size_t bytes;
int ret;
reread:
- bio_reset(sb->bio);
- bio_set_dev(sb->bio, sb->bdev);
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
sb->bio->bi_iter.bi_sector = offset;
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
ret = submit_bio_wait(sb->bio);
if (ret) {
- pr_buf(err, "IO error: %i", ret);
+ prt_printf(err, "IO error: %i", ret);
return ret;
}
- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
- pr_buf(err, "Not a bcachefs superblock");
- return -EINVAL;
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC) &&
+ uuid_le_cmp(sb->sb->magic, BCHFS_MAGIC)) {
+ prt_printf(err, "Not a bcachefs superblock");
+ return -BCH_ERR_invalid_sb_magic;
}
version = le16_to_cpu(sb->sb->version);
- version_min = version >= bcachefs_metadata_version_new_versioning
+ version_min = version >= bcachefs_metadata_version_bkey_renumber
? le16_to_cpu(sb->sb->version_min)
: version;
if (version >= bcachefs_metadata_version_max) {
- pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)",
version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_version;
}
if (version_min < bcachefs_metadata_version_min) {
- pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ prt_printf(err, "Unsupported superblock version %u (min %u, max %u)",
version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_version;
}
bytes = vstruct_bytes(sb->sb);
if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
- pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
bytes, 512UL << sb->sb->layout.sb_max_size_bits);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_too_big;
}
if (bytes > sb->buffer_size) {
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
- return -ENOMEM;
+ ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
+ if (ret)
+ return ret;
goto reread;
}
if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
- pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
- return -EINVAL;
+ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+ return -BCH_ERR_invalid_sb_csum_type;
}
/* XXX: verify MACs */
@@ -556,8 +575,8 @@ reread:
null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) {
- pr_buf(err, "bad checksum");
- return -EINVAL;
+ prt_printf(err, "bad checksum");
+ return -BCH_ERR_invalid_sb_csum;
}
sb->seq = le64_to_cpu(sb->sb->seq);
@@ -570,16 +589,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
- char *_err;
- struct printbuf err;
+ struct printbuf err = PRINTBUF;
__le64 *i;
int ret;
- _err = kmalloc(4096, GFP_KERNEL);
- if (!_err)
- return -ENOMEM;
- err = _PBUF(_err, 4096);
-
pr_verbose_init(*opts, "");
memset(sb, 0, sizeof(*sb));
@@ -610,12 +623,12 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
ret = bch2_sb_realloc(sb, 0);
if (ret) {
- pr_buf(&err, "error allocating memory for superblock");
+ prt_printf(&err, "error allocating memory for superblock");
goto err;
}
if (bch2_fs_init_fault("read_super")) {
- pr_buf(&err, "dynamic fault");
+ prt_printf(&err, "dynamic fault");
ret = -EFAULT;
goto err;
}
@@ -628,17 +641,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
goto err;
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
- path, _err);
- err = _PBUF(_err, 4096);
+ path, err.buf);
+ printbuf_reset(&err);
/*
* Error reading primary superblock - read location of backup
* superblocks:
*/
- bio_reset(sb->bio);
- bio_set_dev(sb->bio, sb->bdev);
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
- bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
/*
* use sb buffer to read layout, since sb buffer is page aligned but
* layout won't be:
@@ -647,7 +658,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
ret = submit_bio_wait(sb->bio);
if (ret) {
- pr_buf(&err, "IO error: %i", ret);
+ prt_printf(&err, "IO error: %i", ret);
goto err;
}
@@ -673,29 +684,29 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
got_super:
if (le16_to_cpu(sb->sb->block_size) << 9 <
bdev_logical_block_size(sb->bdev)) {
- pr_buf(&err, "block size (%u) smaller than device block size (%u)",
+ prt_printf(&err, "block size (%u) smaller than device block size (%u)",
le16_to_cpu(sb->sb->block_size) << 9,
bdev_logical_block_size(sb->bdev));
- ret = -EINVAL;
+ ret = -BCH_ERR_block_size_too_small;
goto err;
}
ret = 0;
sb->have_layout = true;
- ret = bch2_sb_validate(sb, &err);
+ ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
- path, _err);
+ path, err.buf);
goto err_no_print;
}
out:
pr_verbose_init(*opts, "ret %i", ret);
- kfree(_err);
+ printbuf_exit(&err);
return ret;
err:
printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
- path, _err);
+ path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;
@@ -722,12 +733,10 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
struct bch_sb *sb = ca->disk_sb.sb;
struct bio *bio = ca->disk_sb.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
@@ -748,12 +757,10 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
null_nonce(), sb);
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
bch2_bio_map(bio, sb,
roundup((size_t) vstruct_bytes(sb),
bdev_logical_block_size(ca->disk_sb.bdev)));
@@ -769,12 +776,15 @@ int bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
+ struct printbuf err = PRINTBUF;
unsigned i, sb = 0, nr_wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
int ret = 0;
+ trace_and_count(c, write_super, c, _RET_IP_);
+
if (c->opts.very_degraded)
degraded_flags |= BCH_FORCE_IF_LOST;
@@ -783,6 +793,11 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ if (c->opts.version_upgrade) {
+ c->disk_sb.sb->magic = BCHFS_MAGIC;
+ c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
+ }
+
le64_add_cpu(&c->disk_sb.sb->seq, 1);
if (test_bit(BCH_FS_ERROR, &c->flags))
@@ -792,22 +807,17 @@ int bch2_write_super(struct bch_fs *c)
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+ bch2_sb_counters_from_cpu(c);
+
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
for_each_online_member(ca, c, i) {
- struct printbuf buf = { NULL, NULL };
+ printbuf_reset(&err);
- ret = bch2_sb_validate(&ca->disk_sb, &buf);
+ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
if (ret) {
- char *_buf = kmalloc(4096, GFP_NOFS);
- if (_buf) {
- buf = _PBUF(_buf, 4096);
- bch2_sb_validate(&ca->disk_sb, &buf);
- }
-
- bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf);
- kfree(_buf);
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
percpu_ref_put(&ca->io_ref);
goto out;
}
@@ -816,6 +826,13 @@ int bch2_write_super(struct bch_fs *c)
if (c->opts.nochanges)
goto out;
+ /*
+ * Defer writing the superblock until filesystem initialization is
+ * complete - don't write out a partly initialized superblock:
+ */
+ if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
+ goto out;
+
for_each_online_member(ca, c, i) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
@@ -835,7 +852,7 @@ int bch2_write_super(struct bch_fs *c)
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
- ret = -EROFS;
+ ret = -BCH_ERR_erofs_sb_err;
goto out;
}
@@ -845,7 +862,7 @@ int bch2_write_super(struct bch_fs *c)
le64_to_cpu(ca->sb_read_scratch->seq),
ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
- ret = -EROFS;
+ ret = -BCH_ERR_erofs_sb_err;
goto out;
}
}
@@ -898,6 +915,7 @@ int bch2_write_super(struct bch_fs *c)
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
+ printbuf_exit(&err);
return ret;
}
@@ -912,75 +930,9 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
mutex_unlock(&c->sb_lock);
}
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
- u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
-
- return l < r ? -1 : l > r ? 1 : 0;
-}
-
-static int bch2_sb_validate_journal(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
-{
- struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
- int ret = -EINVAL;
- unsigned nr;
- unsigned i;
- u64 *b;
-
- nr = bch2_nr_journal_buckets(journal);
- if (!nr)
- return 0;
-
- b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
- if (!b)
- return -ENOMEM;
-
- for (i = 0; i < nr; i++)
- b[i] = le64_to_cpu(journal->buckets[i]);
-
- sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
- if (!b[0]) {
- pr_buf(err, "journal bucket at sector 0");
- goto err;
- }
-
- if (b[0] < le16_to_cpu(m->first_bucket)) {
- pr_buf(err, "journal bucket %llu before first bucket %u",
- b[0], le16_to_cpu(m->first_bucket));
- goto err;
- }
-
- if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
- pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1], le64_to_cpu(m->nbuckets));
- goto err;
- }
-
- for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1]) {
- pr_buf(err, "duplicate journal buckets %llu", b[i]);
- goto err;
- }
-
- ret = 0;
-err:
- kfree(b);
- return ret;
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
- .validate = bch2_sb_validate_journal,
-};
-
/* BCH_SB_FIELD_members: */
-static int bch2_sb_validate_members(struct bch_sb *sb,
+static int bch2_sb_members_validate(struct bch_sb *sb,
struct bch_sb_field *f,
struct printbuf *err)
{
@@ -989,8 +941,8 @@ static int bch2_sb_validate_members(struct bch_sb *sb,
if ((void *) (mi->members + sb->nr_devices) >
vstruct_end(&mi->field)) {
- pr_buf(err, "too many devices for section size");
- return -EINVAL;
+ prt_printf(err, "too many devices for section size");
+ return -BCH_ERR_invalid_sb_members;
}
for (i = 0; i < sb->nr_devices; i++) {
@@ -1000,69 +952,195 @@ static int bch2_sb_validate_members(struct bch_sb *sb,
continue;
if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
- pr_buf(err, "device %u: too many buckets (got %llu, max %lu)",
+ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
i, le64_to_cpu(m->nbuckets), LONG_MAX);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_members;
}
if (le64_to_cpu(m->nbuckets) -
le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
- pr_buf(err, "device %u: not enough buckets (got %llu, max %u)",
+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_members;
}
if (le16_to_cpu(m->bucket_size) <
le16_to_cpu(sb->block_size)) {
- pr_buf(err, "device %u: bucket size %u smaller than block size %u",
+ prt_printf(err, "device %u: bucket size %u smaller than block size %u",
i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_members;
}
if (le16_to_cpu(m->bucket_size) <
BCH_SB_BTREE_NODE_SIZE(sb)) {
- pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu",
+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_members;
}
}
return 0;
}
+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members *mi = field_to_type(f, members);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
+ unsigned data_have = bch2_sb_dev_has_data(sb, i);
+ u64 bucket_size = le16_to_cpu(m->bucket_size);
+ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
+
+ if (!bch2_member_exists(m))
+ continue;
+
+ prt_printf(out, "Device:");
+ prt_tab(out);
+ prt_printf(out, "%u", i);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "UUID:");
+ prt_tab(out);
+ pr_uuid(out, m->uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Size:");
+ prt_tab(out);
+ prt_units_u64(out, device_size << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Bucket size:");
+ prt_tab(out);
+ prt_units_u64(out, bucket_size << 9);
+ prt_newline(out);
+
+ prt_printf(out, "First bucket:");
+ prt_tab(out);
+ prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
+ prt_newline(out);
+
+ prt_printf(out, "Buckets:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
+ prt_newline(out);
+
+ prt_printf(out, "Last mount:");
+ prt_tab(out);
+ if (m->last_mount)
+ pr_time(out, le64_to_cpu(m->last_mount));
+ else
+ prt_printf(out, "(never)");
+ prt_newline(out);
+
+ prt_printf(out, "State:");
+ prt_tab(out);
+ prt_printf(out, "%s",
+ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
+ ? bch2_member_states[BCH_MEMBER_STATE(m)]
+ : "unknown");
+ prt_newline(out);
+
+ prt_printf(out, "Label:");
+ prt_tab(out);
+ if (BCH_MEMBER_GROUP(m)) {
+ unsigned idx = BCH_MEMBER_GROUP(m) - 1;
+
+ if (idx < disk_groups_nr(gi))
+ prt_printf(out, "%s (%u)",
+ gi->entries[idx].label, idx);
+ else
+ prt_printf(out, "(bad disk labels section)");
+ } else {
+ prt_printf(out, "(none)");
+ }
+ prt_newline(out);
+
+ prt_printf(out, "Data allowed:");
+ prt_tab(out);
+ if (BCH_MEMBER_DATA_ALLOWED(m))
+ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Has data:");
+ prt_tab(out);
+ if (data_have)
+ prt_bitflags(out, bch2_data_types, data_have);
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Discard:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
+ prt_newline(out);
+
+ prt_printf(out, "Freespace initialized:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+ }
+}
+
static const struct bch_sb_field_ops bch_sb_field_ops_members = {
- .validate = bch2_sb_validate_members,
+ .validate = bch2_sb_members_validate,
+ .to_text = bch2_sb_members_to_text,
};
/* BCH_SB_FIELD_crypt: */
-static int bch2_sb_validate_crypt(struct bch_sb *sb,
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
- pr_buf(err, "wrong size (got %llu should be %zu)",
+ prt_printf(err, "wrong size (got %zu should be %zu)",
vstruct_bytes(&crypt->field), sizeof(*crypt));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_crypt;
}
if (BCH_CRYPT_KDF_TYPE(crypt)) {
- pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
- return -EINVAL;
+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ return -BCH_ERR_invalid_sb_crypt;
}
return 0;
}
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
+ prt_newline(out);
+}
+
static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_validate_crypt,
+ .validate = bch2_sb_crypt_validate,
+ .to_text = bch2_sb_crypt_to_text,
};
/* BCH_SB_FIELD_clean: */
-int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
{
struct jset_entry *entry;
int ret;
@@ -1070,7 +1148,7 @@ int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, i
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
- ret = bch2_journal_entry_validate(c, "superblock", entry,
+ ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
write);
@@ -1172,7 +1250,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
- memcpy(&u->r, e, replicas_entry_bytes(e));
+ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
+ "embedded variable length struct");
}
for_each_member_device(ca, c, dev) {
@@ -1185,7 +1264,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(dev);
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
- u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
for (i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
@@ -1234,7 +1312,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
- sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
@@ -1251,7 +1329,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
* this should be in the write path, and we should be validating every
* superblock section:
*/
- ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
if (ret) {
bch_err(c, "error writing marking filesystem clean: validate error");
goto out;
@@ -1262,23 +1340,47 @@ out:
mutex_unlock(&c->sb_lock);
}
-static int bch2_sb_validate_clean(struct bch_sb *sb,
+static int bch2_sb_clean_validate(struct bch_sb *sb,
struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
- pr_buf(err, "wrong size (got %llu should be %zu)",
+ prt_printf(err, "wrong size (got %zu should be %zu)",
vstruct_bytes(&clean->field), sizeof(*clean));
- return -EINVAL;
+ return -BCH_ERR_invalid_sb_clean;
}
return 0;
}
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+ struct jset_entry *entry;
+
+ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
+ prt_newline(out);
+ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
+ prt_newline(out);
+
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+ !entry->u64s)
+ continue;
+
+ bch2_journal_entry_to_text(out, NULL, entry);
+ prt_newline(out);
+ }
+}
+
static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_validate_clean,
+ .validate = bch2_sb_clean_validate,
+ .to_text = bch2_sb_clean_to_text,
};
static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
@@ -1289,24 +1391,25 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
};
static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
- struct printbuf *orig_err)
+ struct printbuf *err)
{
unsigned type = le32_to_cpu(f->type);
- struct printbuf err = *orig_err;
+ struct printbuf field_err = PRINTBUF;
int ret;
if (type >= BCH_SB_FIELD_NR)
return 0;
- pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]);
-
- ret = bch2_sb_field_ops[type]->validate(sb, f, &err);
+ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err);
if (ret) {
- pr_buf(&err, "\n");
- bch2_sb_field_to_text(&err, sb, f);
- *orig_err = err;
+ prt_printf(err, "Invalid superblock section %s: %s",
+ bch2_sb_fields[type],
+ field_err.buf);
+ prt_newline(err);
+ bch2_sb_field_to_text(err, sb, f);
}
+ printbuf_exit(&field_err);
return ret;
}
@@ -1317,13 +1420,179 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR
? bch2_sb_field_ops[type] : NULL;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
if (ops)
- pr_buf(out, "%s", bch2_sb_fields[type]);
+ prt_printf(out, "%s", bch2_sb_fields[type]);
else
- pr_buf(out, "(unknown field %u)", type);
+ prt_printf(out, "(unknown field %u)", type);
- pr_buf(out, " (size %llu):", vstruct_bytes(f));
+ prt_printf(out, " (size %zu):", vstruct_bytes(f));
+ prt_newline(out);
- if (ops && ops->to_text)
+ if (ops && ops->to_text) {
+ printbuf_indent_add(out, 2);
bch2_sb_field_ops[type]->to_text(out, sb, f);
+ printbuf_indent_sub(out, 2);
+ }
+}
+
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
+{
+ unsigned i;
+
+ prt_printf(out, "Type: %u", l->layout_type);
+ prt_newline(out);
+
+ prt_str(out, "Superblock max size: ");
+ prt_units_u64(out, 512 << l->sb_max_size_bits);
+ prt_newline(out);
+
+ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
+ prt_newline(out);
+
+ prt_str(out, "Offsets: ");
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
+ }
+ prt_newline(out);
+}
+
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+ bool print_layout, unsigned fields)
+{
+ struct bch_sb_field_members *mi;
+ struct bch_sb_field *f;
+ u64 fields_have = 0;
+ unsigned nr_devices = 0;
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 44);
+
+ mi = bch2_sb_get_members(sb);
+ if (mi) {
+ struct bch_member *m;
+
+ for (m = mi->members;
+ m < mi->members + sb->nr_devices;
+ m++)
+ nr_devices += bch2_member_exists(m);
+ }
+
+ prt_printf(out, "External UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->user_uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Internal UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->uuid.b);
+ prt_newline(out);
+
+ prt_str(out, "Device index:");
+ prt_tab(out);
+ prt_printf(out, "%u", sb->dev_idx);
+ prt_newline(out);
+
+ prt_str(out, "Label:");
+ prt_tab(out);
+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ prt_newline(out);
+
+ prt_str(out, "Version:");
+ prt_tab(out);
+ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]);
+ prt_newline(out);
+
+ prt_printf(out, "Oldest version on disk:");
+ prt_tab(out);
+ prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]);
+ prt_newline(out);
+
+ prt_printf(out, "Created:");
+ prt_tab(out);
+ if (sb->time_base_lo)
+ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ else
+ prt_printf(out, "(not set)");
+ prt_newline(out);
+
+ prt_printf(out, "Sequence number:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(sb->seq));
+ prt_newline(out);
+
+ prt_printf(out, "Superblock size:");
+ prt_tab(out);
+ prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Clean:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Devices:");
+ prt_tab(out);
+ prt_printf(out, "%u", nr_devices);
+ prt_newline(out);
+
+ prt_printf(out, "Sections:");
+ vstruct_for_each(sb, f)
+ fields_have |= 1 << le32_to_cpu(f->type);
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_fields, fields_have);
+ prt_newline(out);
+
+ prt_printf(out, "Features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
+ prt_newline(out);
+
+ prt_printf(out, "Compat features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "Options:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ {
+ enum bch_opt_id id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, id);
+
+ prt_printf(out, "%s:", opt->attr.name);
+ prt_tab(out);
+ bch2_opt_to_text(out, NULL, sb, opt, v,
+ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+ prt_newline(out);
+ }
+ }
+ }
+
+ printbuf_indent_sub(out, 2);
+
+ if (print_layout) {
+ prt_newline(out);
+ prt_printf(out, "layout:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ bch2_sb_layout_to_text(out, &sb->layout);
+ printbuf_indent_sub(out, 2);
+ }
+
+ vstruct_for_each(sb, f)
+ if (fields & (1 << le32_to_cpu(f->type))) {
+ prt_newline(out);
+ bch2_sb_field_to_text(out, sb, f);
+ }
}
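
Throughout super-io.c the patch also swaps generic -EINVAL/-ENOMEM returns for private -BCH_ERR_* codes, so that callers logging with bch2_err_str() can name the exact failure. A rough standalone model of that scheme, with every name below invented for illustration (the real codes live in fs/bcachefs/errcode.h), could look like:

#include <stdio.h>

#define MY_ERR_START                    2048

enum my_errcode {
        MY_ERR_invalid_sb_version = MY_ERR_START,
        MY_ERR_invalid_sb_csum,
        MY_ERR_invalid_sb_members,
        MY_ERR_MAX,
};

static const char * const my_errcode_strs[] = {
        [MY_ERR_invalid_sb_version - MY_ERR_START]      = "invalid_sb_version",
        [MY_ERR_invalid_sb_csum - MY_ERR_START]         = "invalid_sb_csum",
        [MY_ERR_invalid_sb_members - MY_ERR_START]      = "invalid_sb_members",
};

/* map a (possibly negative) error code back to a readable name */
static const char *my_err_str(int err)
{
        err = err < 0 ? -err : err;

        if (err >= MY_ERR_START && err < MY_ERR_MAX)
                return my_errcode_strs[err - MY_ERR_START];
        return "(generic error)";
}

static int validate_version(unsigned version, unsigned max)
{
        return version >= max ? -MY_ERR_invalid_sb_version : 0;
}

int main(void)
{
        int ret = validate_version(100, 50);

        if (ret)
                fprintf(stderr, "error validating superblock: %s\n", my_err_str(ret));
        return 0;
}
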
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 3b425bed17c4..14a25f6fe29a 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -75,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
__bch2_check_set_feature(c, feat);
}
-/* BCH_SB_FIELD_journal: */
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
- return j
- ? (__le64 *) vstruct_end(&j->field) - j->buckets
- : 0;
-}
-
/* BCH_SB_FIELD_members: */
static inline bool bch2_member_exists(struct bch_member *m)
@@ -112,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
.durability = BCH_MEMBER_DURABILITY(mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
};
}
@@ -121,12 +113,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
void bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry **, u64);
-int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
int bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_clean(struct bch_fs *);
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
#endif /* _BCACHEFS_SUPER_IO_H */
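
For context on the declarations added above: in bch2_sb_to_text() the bool chooses whether the superblock layout is printed and the unsigned is a bitmask of section types to dump, as the function body earlier in this patch shows. A hedged usage sketch, assuming the caller already holds a valid struct bch_sb:

        struct printbuf out = PRINTBUF;

        /* print everything: layout plus every superblock section */
        bch2_sb_to_text(&out, sb, true, ~0U);
        if (!out.allocation_failure)
                pr_info("%s\n", out.buf);
        printbuf_exit(&out);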
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index e706b203a030..613d09f5b8e6 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -16,6 +16,7 @@
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
@@ -24,6 +25,7 @@
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
+#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
@@ -36,6 +38,7 @@
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
+#include "nocow_locking.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
@@ -44,12 +47,12 @@
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
+#include "counters.h"
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
-#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/percpu.h>
@@ -63,14 +66,26 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
#define KTYPE(type) \
-struct kobj_type type ## _ktype = { \
+static const struct attribute_group type ## _group = { \
+ .attrs = type ## _files \
+}; \
+ \
+static const struct attribute_group *type ## _groups[] = { \
+ &type ## _group, \
+ NULL \
+}; \
+ \
+static const struct kobj_type type ## _ktype = { \
.release = type ## _release, \
.sysfs_ops = &type ## _sysfs_ops, \
- .default_attrs = type ## _files \
+ .default_groups = type ## _groups \
}
static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
static void bch2_fs_internal_release(struct kobject *k)
{
@@ -84,17 +99,18 @@ static void bch2_fs_time_stats_release(struct kobject *k)
{
}
-static KTYPE(bch2_fs);
-static KTYPE(bch2_fs_internal);
-static KTYPE(bch2_fs_opts_dir);
-static KTYPE(bch2_fs_time_stats);
-static KTYPE(bch2_dev);
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_counters);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
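
The kobj_type change above tracks the kernel's removal of .default_attrs: each type now wraps its attribute array in an attribute_group plus a NULL-terminated groups array. Written out by hand, KTYPE(bch2_fs) expands to roughly the following (hand expansion, shown for illustration only):

static const struct attribute_group bch2_fs_group = {
        .attrs = bch2_fs_files                  /* the existing attribute array */
};

static const struct attribute_group *bch2_fs_groups[] = {
        &bch2_fs_group,
        NULL                                    /* default_groups must be NULL-terminated */
};

static const struct kobj_type bch2_fs_ktype = {
        .release        = bch2_fs_release,
        .sysfs_ops      = &bch2_fs_sysfs_ops,
        .default_groups = bch2_fs_groups        /* replaces the removed .default_attrs */
};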
static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);
-static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
+DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
@@ -167,44 +183,6 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
&c->dev_usage_journal_res, u64s * nr);
}
-int bch2_congested(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
- struct backing_dev_info *bdi;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
-
- rcu_read_lock();
- if (bdi_bits & (1 << WB_sync_congested)) {
- /* Reads - check all devices: */
- for_each_readable_member(ca, c, i) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- } else {
- const struct bch_devs_mask *devs =
- bch2_target_to_mask(c, c->opts.foreground_target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_member_device_rcu(ca, c, i, devs) {
- bdi = ca->disk_sb.bdev->bd_bdi;
-
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
/* Filesystem RO/RW: */
/*
@@ -226,85 +204,54 @@ static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i, clean_passes = 0;
+ u64 seq = 0;
+ bch2_fs_ec_stop(c);
+ bch2_open_buckets_stop(c, NULL, true);
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
+ bch2_fs_ec_flush(c);
- /*
- * Flush journal before stopping allocators, because flushing journal
- * blacklist entries involves allocating new btree nodes:
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
- * If the allocator threads didn't all start up, the btree updates to
- * write out alloc info aren't going to work:
- */
- if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
- goto nowrote_alloc;
-
- bch_verbose(c, "flushing journal and stopping allocators");
-
- bch2_journal_flush_all_pins(&c->journal);
- set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
+ bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
+ journal_cur_seq(&c->journal));
do {
clean_passes++;
- if (bch2_journal_flush_all_pins(&c->journal))
- clean_passes = 0;
-
- /*
- * In flight interior btree updates will generate more journal
- * updates and btree updates (alloc btree):
- */
- if (bch2_btree_interior_updates_nr_pending(c)) {
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+ if (bch2_btree_interior_updates_flush(c) ||
+ bch2_journal_flush_all_pins(&c->journal) ||
+ bch2_btree_flush_all_writes(c) ||
+ seq != atomic64_read(&c->journal.seq)) {
+ seq = atomic64_read(&c->journal.seq);
clean_passes = 0;
}
- flush_work(&c->btree_interior_update_work);
-
- if (bch2_journal_flush_all_pins(&c->journal))
- clean_passes = 0;
} while (clean_passes < 2);
- bch_verbose(c, "flushing journal and stopping allocators complete");
- set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
- flush_work(&c->btree_interior_update_work);
-
- for_each_member_device(ca, c, i)
- bch2_dev_allocator_stop(ca);
-
- clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
- clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
+ bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
+ journal_cur_seq(&c->journal));
+ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
bch2_fs_journal_stop(&c->journal);
/*
- * the journal kicks off btree writes via reclaim - wait for in flight
- * writes after stopping journal:
- */
- bch2_btree_flush_all_writes(c);
-
- /*
* After stopping journal:
*/
for_each_member_device(ca, c, i)
bch2_dev_allocator_remove(c, ca);
}
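
The rewritten loop above is a quiesce-until-stable pattern: keep flushing interior updates, journal pins, and btree writes, and only stop after two consecutive passes in which none of them did any work and the journal sequence number did not move. A generic sketch of the idiom, with flush_one_round() standing in (hypothetically) for those checks:

        unsigned clean_passes = 0;

        do {
                clean_passes++;
                /* flush_one_round() is a hypothetical stand-in: true if any work was done */
                if (flush_one_round(c))
                        clean_passes = 0;       /* progress was made: restart the count */
        } while (clean_passes < 2);             /* two idle passes in a row => quiescent */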
+#ifndef BCH_WRITE_REF_DEBUG
static void bch2_writes_disabled(struct percpu_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- wake_up(&bch_read_only_wait);
+ wake_up(&bch2_read_only_wait);
}
+#endif
void bch2_fs_read_only(struct bch_fs *c)
{
@@ -318,14 +265,14 @@ void bch2_fs_read_only(struct bch_fs *c)
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
- *
- * (This is really blocking new _allocations_, writes to previously
- * allocated space can still happen until stopping the allocator in
- * bch2_dev_allocator_stop()).
*/
+ set_bit(BCH_FS_GOING_RO, &c->flags);
+#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_kill(&c->writes);
-
- cancel_work_sync(&c->ec_stripe_delete_work);
+#else
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ bch2_write_ref_put(c, i);
+#endif
/*
* If we're not doing an emergency shutdown, we want to wait on
@@ -338,22 +285,23 @@ void bch2_fs_read_only(struct bch_fs *c)
* we do need to wait on them before returning and signalling
* that going RO is complete:
*/
- wait_event(bch_read_only_wait,
+ wait_event(bch2_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
__bch2_fs_read_only(c);
- wait_event(bch_read_only_wait,
+ wait_event(bch2_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ clear_bit(BCH_FS_GOING_RO, &c->flags);
if (!bch2_journal_error(&c->journal) &&
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
test_bit(BCH_FS_STARTED, &c->flags) &&
- test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
+ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
!c->opts.norecovery) {
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
@@ -384,7 +332,7 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
bch2_journal_halt(&c->journal);
bch2_fs_read_only_async(c);
- wake_up(&bch_read_only_wait);
+ wake_up(&bch2_read_only_wait);
return ret;
}
@@ -392,26 +340,12 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
{
int ret;
- ret = bch2_gc_thread_start(c);
- if (ret) {
- bch_err(c, "error starting gc thread");
- return ret;
- }
-
- ret = bch2_copygc_start(c);
- if (ret) {
- bch_err(c, "error starting copygc thread");
- return ret;
- }
-
ret = bch2_rebalance_start(c);
if (ret) {
bch_err(c, "error starting rebalance thread");
return ret;
}
- schedule_work(&c->ec_stripe_delete_work);
-
return 0;
}
@@ -444,25 +378,31 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
- clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+
+ /*
+ * First journal write must be a flush write: after a clean shutdown we
+ * don't read the journal, so the first journal write may end up
+ * overwriting whatever was there previously, and there must always be
+ * at least one non-flush write in the journal or recovery will fail:
+ */
+ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for_each_rw_member(ca, c, i) {
- ret = bch2_dev_allocator_start(ca);
- if (ret) {
- bch_err(c, "error starting allocator threads");
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
+ ret = bch2_gc_thread_start(c);
+ if (ret) {
+ bch_err(c, "error starting gc thread");
+ return ret;
}
- set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
- for_each_rw_member(ca, c, i)
- bch2_wake_allocator(ca);
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err(c, "error starting copygc thread");
+ return ret;
+ }
if (!early) {
ret = bch2_fs_read_write_late(c);
@@ -470,9 +410,21 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
goto err;
}
+#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
+#else
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
+ BUG_ON(atomic_long_read(&c->writes[i]));
+ atomic_long_inc(&c->writes[i]);
+ }
+#endif
set_bit(BCH_FS_RW, &c->flags);
set_bit(BCH_FS_WAS_RW, &c->flags);
+
+ bch2_do_discards(c);
+ bch2_do_invalidates(c);
+ bch2_do_stripe_deletes(c);
+ bch2_do_pending_node_rewrites(c);
return 0;
err:
__bch2_fs_read_only(c);
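
The BCH_WRITE_REF_DEBUG branches above replace the single percpu_ref with one atomic_long per BCH_WRITE_REF_* call site, which is what lets the write_refs sysfs file added later in this patch show where outstanding references are held. A rough sketch of the debug-build shape — the helper bodies here are assumptions for illustration; only the writes[] array and the bch2_write_ref_put() name are taken from the patch:

#ifdef BCH_WRITE_REF_DEBUG
/* Assumed shape, for illustration only: */
static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
{
        atomic_long_inc(&c->writes[ref]);
}

static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
{
        BUG_ON(atomic_long_dec_return(&c->writes[ref]) < 0);    /* unbalanced put for this site */
}

/* Writes are fully drained only once every per-site counter has hit zero: */
static inline bool bch2_writes_drained(struct bch_fs *c)
{
        for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
                if (atomic_long_read(&c->writes[i]))
                        return false;
        return true;
}
#endif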
@@ -501,6 +453,8 @@ static void __bch2_fs_free(struct bch_fs *c)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_free_pending_node_rewrites(c);
+ bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
@@ -518,28 +472,32 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
+ bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
+ free_percpu(c->online_reserved);
if (c->btree_paths_bufs)
for_each_possible_cpu(cpu)
kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
- free_percpu(c->online_reserved);
free_percpu(c->btree_paths_bufs);
free_percpu(c->pcpu);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
mempool_exit(&c->fill_iter);
+#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_exit(&c->writes);
+#endif
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
kfree(c->unused_inode_hints);
- free_heap(&c->copygc_heap);
- if (c->io_complete_wq )
- destroy_workqueue(c->io_complete_wq );
+ if (c->write_ref_wq)
+ destroy_workqueue(c->write_ref_wq);
+ if (c->io_complete_wq)
+ destroy_workqueue(c->io_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->btree_io_complete_wq)
@@ -577,8 +535,7 @@ void __bch2_fs_stop(struct bch_fs *c)
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcachefs");
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
@@ -586,6 +543,7 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_debug_exit(c);
bch2_fs_chardev_exit(c);
+ kobject_put(&c->counters_kobj);
kobject_put(&c->time_stats);
kobject_put(&c->opts_dir);
kobject_put(&c->internal);
@@ -654,6 +612,7 @@ static int bch2_fs_online(struct bch_fs *c)
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
bch_err(c, "error creating sysfs objects");
@@ -682,6 +641,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
+ struct printbuf name = PRINTBUF;
unsigned i, iter_size;
int ret = 0;
@@ -689,7 +649,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c) {
- c = ERR_PTR(-ENOMEM);
+ c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
}
@@ -702,6 +662,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
c->minor = -1;
c->disk_sb.fs_sb = true;
@@ -713,6 +674,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
@@ -723,6 +685,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);
bch2_fs_quota_init(c);
+ bch2_fs_ec_init_early(c);
+ bch2_fs_move_init(c);
INIT_LIST_HEAD(&c->list);
@@ -736,28 +700,19 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
- INIT_LIST_HEAD(&c->journal_entries);
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
- INIT_LIST_HEAD(&c->ec_stripe_head_list);
- mutex_init(&c->ec_stripe_head_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_new_list);
- mutex_init(&c->ec_stripe_new_lock);
-
- INIT_LIST_HEAD(&c->data_progress_list);
- mutex_init(&c->data_progress_lock);
-
- spin_lock_init(&c->ec_stripes_heap_lock);
-
seqcount_init(&c->gc_pos_lock);
seqcount_init(&c->usage_lock);
- sema_init(&c->io_in_flight, 64);
+ sema_init(&c->io_in_flight, 128);
+
+ INIT_LIST_HEAD(&c->vfs_inodes_list);
+ mutex_init(&c->vfs_inodes_lock);
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
@@ -783,7 +738,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- uuid_unparse_lower(c->sb.user_uuid.b, c->name);
+ pr_uuid(&name, c->sb.user_uuid.b);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
+ if (ret)
+ goto err;
/* Compat: */
if (sb->version <= bcachefs_metadata_version_inode_v2 &&
@@ -801,6 +762,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_opts_apply(&c->opts, opts);
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+ if (c->opts.inodes_use_key_cache)
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+
c->block_bits = ilog2(block_sectors(c));
c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
@@ -817,33 +782,38 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+ WQ_FREEZABLE, 0)) ||
+#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+#endif
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_bio, 1,
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
- !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
+ !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_fs_other_alloc;
goto err;
}
- ret = bch2_io_clock_init(&c->io_clock[READ]) ?:
+ ret = bch2_fs_counters_init(c) ?:
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
bch2_fs_replicas_init(c) ?:
@@ -851,9 +821,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
- bch2_fs_buckets_waiting_for_journal_init(c);
+ bch2_fs_buckets_waiting_for_journal_init(c) ?:
+ bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
bch2_fs_io_init(c) ?:
+ bch2_fs_nocow_locking_init(c) ?:
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
@@ -861,9 +833,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- if (c->opts.nochanges)
- set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
-
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -899,15 +868,15 @@ noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
enum bch_opt_id i;
- char buf[512];
- struct printbuf p = PBUF(buf);
+ struct printbuf p = PRINTBUF;
bool first = true;
- strcpy(buf, "(null)");
+ prt_printf(&p, "mounted version=%s", bch2_metadata_versions[c->sb.version]);
if (c->opts.read_only) {
- pr_buf(&p, "ro");
+ prt_str(&p, " opts=");
first = false;
+ prt_printf(&p, "ro");
}
for (i = 0; i < bch2_opts_nr; i++) {
@@ -920,13 +889,13 @@ static void print_mount_opts(struct bch_fs *c)
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
continue;
- if (!first)
- pr_buf(&p, ",");
+ prt_str(&p, first ? " opts=" : ",");
first = false;
- bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE);
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
}
- bch_info(c, "mounted with opts: %s", buf);
+ bch_info(c, "%s", p.buf);
+ printbuf_exit(&p);
}
int bch2_fs_start(struct bch_fs *c)
@@ -935,7 +904,7 @@ int bch2_fs_start(struct bch_fs *c)
struct bch_dev *ca;
time64_t now = ktime_get_real_seconds();
unsigned i;
- int ret = -EINVAL;
+ int ret;
down_write(&c->state_lock);
@@ -956,6 +925,12 @@ int bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
+ mutex_lock(&c->btree_transaction_stats[i].lock);
+ bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
+ mutex_unlock(&c->btree_transaction_stats[i].lock);
+ }
+
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
@@ -966,28 +941,14 @@ int bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
- ret = -EINVAL;
if (bch2_fs_init_fault("fs_start")) {
bch_err(c, "fs_start fault injected");
+ ret = -EINVAL;
goto err;
}
set_bit(BCH_FS_STARTED, &c->flags);
- /*
- * Allocator threads don't start filling copygc reserve until after we
- * set BCH_FS_STARTED - wake them now:
- *
- * XXX ugly hack:
- * Need to set ca->allocator_state here instead of relying on the
- * allocator threads to do it to avoid racing with the copygc threads
- * checking it and thinking they have no alloc reserve:
- */
- for_each_online_member(ca, c, i) {
- ca->allocator_state = ALLOCATOR_running;
- bch2_wake_allocator(ca);
- }
-
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
@@ -1004,68 +965,44 @@ out:
up_write(&c->state_lock);
return ret;
err:
- switch (ret) {
- case BCH_FSCK_ERRORS_NOT_FIXED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("mount with -o fix_errors to repair\n");
- break;
- case BCH_FSCK_REPAIR_UNIMPLEMENTED:
- bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("repair unimplemented: inform the developers so that it can be added\n");
- break;
- case BCH_FSCK_REPAIR_IMPOSSIBLE:
- bch_err(c, "filesystem contains errors, but repair impossible");
- break;
- case BCH_FSCK_UNKNOWN_VERSION:
- bch_err(c, "unknown metadata version");
- break;
- case -ENOMEM:
- bch_err(c, "cannot allocate memory");
- break;
- case -EIO:
- bch_err(c, "IO error");
- break;
- }
-
- if (ret >= 0)
- ret = -EIO;
+ bch_err(c, "error starting filesystem: %s", bch2_err_str(ret));
goto out;
}
-static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
+static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
struct bch_sb_field_members *sb_mi;
sb_mi = bch2_sb_get_members(sb);
if (!sb_mi)
- return "Invalid superblock: member info area missing";
+ return -BCH_ERR_member_info_missing;
if (le16_to_cpu(sb->block_size) != block_sectors(c))
- return "mismatched block size";
+ return -BCH_ERR_mismatched_block_size;
if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return "new cache bucket size is too small";
+ return -BCH_ERR_bucket_size_too_small;
- return NULL;
+ return 0;
}
-static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
{
struct bch_sb *newest =
le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
struct bch_sb_field_members *mi = bch2_sb_get_members(newest);
if (uuid_le_cmp(fs->uuid, sb->uuid))
- return "device not a member of filesystem";
+ return -BCH_ERR_device_not_a_member_of_filesystem;
if (!bch2_dev_exists(newest, mi, sb->dev_idx))
- return "device has been removed";
+ return -BCH_ERR_device_has_been_removed;
if (fs->block_size != sb->block_size)
- return "mismatched block size";
+ return -BCH_ERR_mismatched_block_size;
- return NULL;
+ return 0;
}
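
The two helpers above show this patch's wider errcode conversion: instead of returning a message string (or a bare -EINVAL), they return a distinct -BCH_ERR_* value, and callers format it only where a message is actually logged. A sketch of the resulting pattern, using only names that appear in this patch:

        /* inside the filesystem, keep the precise private code: */
        ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
        if (ret)
                bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret));

        /* at a userspace boundary (e.g. the sysfs wrappers below), fold it back to a plain errno: */
        return bch2_err_class(ret);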
/* Device startup/shutdown: */
@@ -1079,14 +1016,11 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
- bch2_dev_allocator_stop(ca);
-
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
- sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
- "bcachefs");
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
@@ -1122,10 +1056,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
-
- sysfs_remove_link(block, "bcachefs");
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
sysfs_remove_link(&ca->kobj, "block");
}
@@ -1162,12 +1093,12 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
}
if (ca->disk_sb.bdev) {
- struct kobject *block =
- &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj;
+ struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
if (ret)
return ret;
+
ret = sysfs_create_link(&ca->kobj, block, "block");
if (ret)
return ret;
@@ -1199,8 +1130,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
- if (opt_defined(c->opts, discard))
- ca->mi.discard = opt_get(c->opts, discard);
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
@@ -1251,12 +1182,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c;
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- bch2_dev_allocator_start(ca)) {
- bch2_dev_free(ca);
- goto err;
- }
-
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
@@ -1264,7 +1189,7 @@ out:
err:
if (ca)
bch2_dev_free(ca);
- ret = -ENOMEM;
+ ret = -BCH_ERR_ENOMEM_dev_alloc;
goto out;
}
@@ -1275,23 +1200,17 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
sb->sb->dev_idx);
- return -EINVAL;
+ return -BCH_ERR_device_already_online;
}
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
bch_err(ca, "cannot online: device too small");
- return -EINVAL;
+ return -BCH_ERR_device_size_too_small;
}
BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
- if (get_capacity(sb->bdev->bd_disk) <
- ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(ca, "device too small");
- return -EINVAL;
- }
-
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
@@ -1332,8 +1251,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
bch2_dev_sysfs_online(c, ca);
if (c->sb.nr_devices == 1)
- bdevname(ca->disk_sb.bdev, c->name);
- bdevname(ca->disk_sb.bdev, ca->name);
+ snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
+ snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
rebalance_wakeup(c);
return 0;
@@ -1434,22 +1353,13 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
/*
- * Device going read only means the copygc reserve get smaller, so we
- * don't want that happening while copygc is in progress:
- */
- bch2_copygc_stop(c);
-
- /*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
- bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca);
-
- bch2_copygc_start(c);
}
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
@@ -1457,8 +1367,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
-
- return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1471,7 +1379,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
return 0;
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return -EINVAL;
+ return -BCH_ERR_device_state_not_allowed;
if (new_state != BCH_MEMBER_STATE_rw)
__bch2_dev_read_only(c, ca);
@@ -1485,7 +1393,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw)
- ret = __bch2_dev_read_write(c, ca);
+ __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
@@ -1508,37 +1416,35 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- struct btree_trans trans;
- size_t i;
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
- for (i = 0; i < ca->mi.nbuckets; i++) {
- ret = lockrestart_do(&trans,
- bch2_btree_key_cache_flush(&trans,
- BTREE_ID_alloc, POS(ca->dev_idx, i)));
- if (ret)
- break;
- }
- bch2_trans_exit(&trans);
-
- if (ret) {
- bch_err(c, "error %i removing dev alloc info", ret);
- return ret;
- }
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_NORUN, NULL);
+ if (ret)
+ bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret));
- return bch2_btree_delete_range(c, BTREE_ID_alloc,
- POS(ca->dev_idx, 0),
- POS(ca->dev_idx + 1, 0),
- 0, NULL);
+ return ret;
}
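
Two details of the rewritten helper above are worth spelling out: the alloc-related btrees key a device's entries under inode == dev_idx, so POS(ca->dev_idx, 0)..POS(ca->dev_idx, U64_MAX) spans exactly one device, and chaining the deletions with ?: makes the first failing call short-circuit the rest. A reduced sketch of the same shape, with just two btrees:

        struct bpos start = POS(ca->dev_idx, 0);
        struct bpos end   = POS(ca->dev_idx, U64_MAX);  /* every key for this one device */
        int ret;

        /* each call runs only if the previous one returned 0; the first error is returned: */
        ret = bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
                                      BTREE_TRIGGER_NORUN, NULL) ?:
              bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
                                      BTREE_TRIGGER_NORUN, NULL);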
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_sb_field_members *mi;
unsigned dev_idx = ca->dev_idx, data;
- int ret = -EINVAL;
+ int ret;
down_write(&c->state_lock);
@@ -1550,6 +1456,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
+ ret = -BCH_ERR_device_state_not_allowed;
goto err;
}
@@ -1557,32 +1464,23 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
- bch_err(ca, "Remove failed: error %i dropping data", ret);
+ bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret));
goto err;
}
- ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+ ret = bch2_dev_remove_alloc(c, ca);
if (ret) {
- bch_err(ca, "Remove failed: error %i flushing journal", ret);
+ bch_err(ca, "Remove failed, error deleting alloc info");
goto err;
}
- ret = bch2_dev_remove_alloc(c, ca);
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
- bch_err(ca, "Remove failed, error deleting alloc info");
+ bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret));
goto err;
}
- /*
- * must flush all existing journal entries, they might have
- * (overwritten) keys that point to the device we're removing:
- */
- bch2_journal_flush_all_pins(&c->journal);
- /*
- * hack to ensure bch2_replicas_gc2() clears out entries to this device
- */
- bch2_journal_meta(&c->journal);
- ret = bch2_journal_error(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
if (ret) {
bch_err(ca, "Remove failed, journal error");
goto err;
@@ -1590,17 +1488,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
ret = bch2_replicas_gc2(c);
if (ret) {
- bch_err(ca, "Remove failed: error %i from replicas gc", ret);
+ bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret));
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
- char data_has_str[100];
+ struct printbuf data_has = PRINTBUF;
- bch2_flags_to_text(&PBUF(data_has_str),
- bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ prt_bitflags(&data_has, bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ printbuf_exit(&data_has);
ret = -EBUSY;
goto err;
}
@@ -1644,33 +1542,33 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
- const char *err;
struct bch_dev *ca = NULL;
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
- char *_errbuf;
- struct printbuf errbuf;
+ struct printbuf errbuf = PRINTBUF;
+ struct printbuf label = PRINTBUF;
int ret;
- _errbuf = kmalloc(4096, GFP_KERNEL);
- if (!_errbuf)
- return -ENOMEM;
-
- errbuf = _PBUF(_errbuf, 4096);
-
ret = bch2_read_super(path, &opts, &sb);
if (ret) {
- bch_err(c, "device add error: error reading super: %i", ret);
+ bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret));
goto err;
}
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
- err = bch2_dev_may_add(sb.sb, c);
- if (err) {
- bch_err(c, "device add error: %s", err);
- ret = -EINVAL;
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ if (label.allocation_failure) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
+ ret = bch2_dev_may_add(sb.sb, c);
+ if (ret) {
+ bch_err(c, "device add error: %s", bch2_err_str(ret));
goto err;
}
@@ -1681,6 +1579,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
}
+ bch2_dev_usage_init(ca);
+
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret) {
bch2_dev_free(ca);
@@ -1708,7 +1608,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
le32_to_cpu(mi->field.u64s) +
sizeof(dev_mi) / sizeof(u64))) {
bch_err(c, "device add error: new device superblock too small");
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_members;
goto err_unlock;
}
@@ -1721,7 +1621,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto have_slot;
no_slot:
bch_err(c, "device add error: already have maximum number of devices");
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_members;
goto err_unlock;
have_slot:
@@ -1732,7 +1632,7 @@ have_slot:
mi = bch2_sb_resize_members(&c->disk_sb, u64s);
if (!mi) {
bch_err(c, "device add error: no room in superblock for member info");
- ret = -ENOSPC;
+ ret = -BCH_ERR_ENOSPC_sb_members;
goto err_unlock;
}
@@ -1745,6 +1645,14 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ ret = __bch2_dev_group_set(c, ca, label.buf);
+ if (ret) {
+ bch_err(c, "device add error: error setting label");
+ goto err_unlock;
+ }
+ }
+
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1752,19 +1660,20 @@ have_slot:
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err(c, "device add error: error marking new superblock: %i", ret);
+ bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret));
+ goto err_late;
+ }
+
+ ret = bch2_fs_freespace_init(c);
+ if (ret) {
+ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
goto err_late;
}
ca->new_fs_bucket_idx = 0;
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret) {
- bch_err(c, "device add error: error going RW on new device: %i", ret);
- goto err_late;
- }
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return 0;
@@ -1776,7 +1685,8 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- kfree(_errbuf);
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
return ret;
err_late:
up_write(&c->state_lock);
@@ -1792,7 +1702,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned dev_idx;
- const char *err;
int ret;
down_write(&c->state_lock);
@@ -1805,9 +1714,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
- if (err) {
- bch_err(c, "error bringing %s online: %s", path, err);
+ ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+ if (ret) {
+ bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret));
goto err;
}
@@ -1819,16 +1728,13 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb",
- path, ret);
+ bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s",
+ path, bch2_err_str(ret));
goto err;
}
- if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- ret = __bch2_dev_read_write(c, ca);
- if (ret)
- goto err;
- }
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
@@ -1839,12 +1745,16 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
+
up_write(&c->state_lock);
return 0;
err:
up_write(&c->state_lock);
bch2_free_super(&sb);
- return -EINVAL;
+ return ret;
}
int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
@@ -1860,7 +1770,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot offline required disk");
up_write(&c->state_lock);
- return -EINVAL;
+ return -BCH_ERR_device_state_not_allowed;
}
__bch2_dev_offline(c, ca);
@@ -1886,20 +1796,19 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
bch_err(ca, "New size larger than device");
- ret = -EINVAL;
+ ret = -BCH_ERR_device_size_too_small;
goto err;
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
if (ret) {
- bch_err(ca, "Resize error: %i", ret);
+ bch_err(ca, "Resize error: %s", bch2_err_str(ret));
goto err;
}
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
+ if (ret)
goto err;
- }
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
@@ -1915,22 +1824,19 @@ err:
}
/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- struct block_device *bdev = lookup_bdev(path);
struct bch_dev *ca;
unsigned i;
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
-
- for_each_member_device(ca, c, i)
- if (ca->disk_sb.bdev == bdev)
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ if (!strcmp(name, ca->name))
goto found;
-
ca = ERR_PTR(-ENOENT);
found:
- bdput(bdev);
+ rcu_read_unlock();
+
return ca;
}
@@ -1943,9 +1849,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_fs *c = NULL;
struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
- const char *err;
- char *_errbuf = NULL;
- struct printbuf errbuf;
+ struct printbuf errbuf = PRINTBUF;
int ret = 0;
if (!try_module_get(THIS_MODULE))
@@ -1958,14 +1862,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err;
}
- _errbuf = kmalloc(4096, GFP_KERNEL);
- if (!_errbuf) {
- ret = -ENOMEM;
- goto err;
- }
-
- errbuf = _PBUF(_errbuf, 4096);
-
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
if (!sb) {
ret = -ENOMEM;
@@ -1990,16 +1886,14 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
while (i < nr_devices) {
if (i != best_sb &&
!bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
- char buf[BDEVNAME_SIZE];
- pr_info("%s has been removed, skipping",
- bdevname(sb[i].bdev, buf));
+ pr_info("%pg has been removed, skipping", sb[i].bdev);
bch2_free_super(&sb[i]);
array_remove_item(sb, nr_devices, i);
continue;
}
- err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
- if (err)
+ ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+ if (ret)
goto err_print;
i++;
}
@@ -2020,9 +1914,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
up_write(&c->state_lock);
- err = "insufficient devices";
- if (!bch2_fs_may_start(c))
+ if (!bch2_fs_may_start(c)) {
+ ret = -BCH_ERR_insufficient_devices_to_start;
goto err_print;
+ }
if (!c->opts.nostart) {
ret = bch2_fs_start(c);
@@ -2031,14 +1926,14 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
out:
kfree(sb);
- kfree(_errbuf);
+ printbuf_exit(&errbuf);
module_put(THIS_MODULE);
- pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
+ pr_verbose_init(opts, "ret %s (%i)", bch2_err_str(PTR_ERR_OR_ZERO(c)),
+ PTR_ERR_OR_ZERO(c));
return c;
err_print:
pr_err("bch_fs_open err opening %s: %s",
- devices[0], err);
- ret = -EINVAL;
+ devices[0], bch2_err_str(ret));
err:
if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
@@ -2085,5 +1980,8 @@ err:
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
+unsigned bch2_metadata_version = bcachefs_metadata_version_current;
+module_param_named(version, bch2_metadata_version, uint, 0400);
+
module_exit(bcachefs_exit);
module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 7e118244abe6..d4e939c808fa 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -26,6 +26,12 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
return remainder;
}
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+ u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
+
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
return !percpu_ref_is_zero(&ca->io_ref);
@@ -82,9 +88,10 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
unsigned dev)
{
- BUG_ON(bch2_dev_list_has_dev(*devs, dev));
- BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
- devs->devs[devs->nr++] = dev;
+ if (!bch2_dev_list_has_dev(*devs, dev)) {
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
+ devs->devs[devs->nr++] = dev;
+ }
}
static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
@@ -217,7 +224,6 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
@@ -245,7 +251,8 @@ int bch2_fs_read_write_early(struct bch_fs *);
*/
static inline void bch2_fs_lazy_rw(struct bch_fs *c)
{
- if (percpu_ref_is_zero(&c->writes))
+ if (!test_bit(BCH_FS_RW, &c->flags) &&
+ !test_bit(BCH_FS_WAS_RW, &c->flags))
bch2_fs_read_write_early(c);
}
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index d8b159a5b7f7..89419fc7930d 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -32,6 +32,7 @@ struct bch_member_cpu {
u8 discard;
u8 data_allowed;
u8 durability;
+ u8 freespace_initialized;
u8 valid;
};
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index ed9a095063e8..0f86a6c0c9d8 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -27,6 +27,8 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
#include "opts.h"
#include "rebalance.h"
#include "replicas.h"
@@ -40,46 +42,75 @@
#include "util.h"
#define SYSFS_OPS(type) \
-struct sysfs_ops type ## _sysfs_ops = { \
+const struct sysfs_ops type ## _sysfs_ops = { \
.show = type ## _show, \
.store = type ## _store \
}
#define SHOW(fn) \
+static ssize_t fn ## _to_text(struct printbuf *, \
+ struct kobject *, struct attribute *); \
+ \
static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
char *buf) \
+{ \
+ struct printbuf out = PRINTBUF; \
+ ssize_t ret = fn ## _to_text(&out, kobj, attr); \
+ \
+ if (out.pos && out.buf[out.pos - 1] != '\n') \
+ prt_newline(&out); \
+ \
+ if (!ret && out.allocation_failure) \
+ ret = -ENOMEM; \
+ \
+ if (!ret) { \
+ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
+ memcpy(buf, out.buf, ret); \
+ } \
+ printbuf_exit(&out); \
+ return bch2_err_class(ret); \
+} \
+ \
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+ struct attribute *attr)
#define STORE(fn) \
+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
+ const char *, size_t); \
+ \
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
const char *buf, size_t size) \
+{ \
+ return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
+} \
+ \
+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size)
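
With the reworked SHOW()/STORE() above, an attribute body no longer returns out.pos - buf: it appends to the supplied printbuf and returns 0 or an error, while the generated _show() wrapper adds the trailing newline, maps allocation failure to -ENOMEM, copies at most a page, and translates the errcode. A minimal sketch of what a body now looks like (bch2_example is a hypothetical name; the attributes and helpers referenced are ones already used in this file):

SHOW(bch2_example)      /* expands to bch2_example_to_text(out, kobj, attr) plus its _show() wrapper */
{
        struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

        sysfs_print(minor, c->minor);                   /* simple attributes still use the helpers */

        if (attr == &sysfs_btree_cache_size)            /* multi-line output just writes to *out */
                prt_human_readable_s64(out, bch2_btree_cache_size(c));

        return 0;                                       /* the wrapper copies out.buf into the sysfs page */
}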
#define __sysfs_attribute(_name, _mode) \
static struct attribute sysfs_##_name = \
{ .name = #_name, .mode = _mode }
-#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
-#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
-#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+#define write_attribute(n) __sysfs_attribute(n, 0200)
+#define read_attribute(n) __sysfs_attribute(n, 0444)
+#define rw_attribute(n) __sysfs_attribute(n, 0644)
#define sysfs_printf(file, fmt, ...) \
do { \
if (attr == &sysfs_ ## file) \
- return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
+ prt_printf(out, fmt "\n", __VA_ARGS__); \
} while (0)
#define sysfs_print(file, var) \
do { \
if (attr == &sysfs_ ## file) \
- return snprint(buf, PAGE_SIZE, var); \
+ snprint(out, var); \
} while (0)
#define sysfs_hprint(file, val) \
do { \
- if (attr == &sysfs_ ## file) { \
- bch2_hprint(&out, val); \
- pr_buf(&out, "\n"); \
- return out.pos - buf; \
- } \
+ if (attr == &sysfs_ ## file) \
+ prt_human_readable_s64(out, val); \
} while (0)
#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
@@ -133,7 +164,10 @@ do { \
} while (0)
write_attribute(trigger_gc);
+write_attribute(trigger_discards);
+write_attribute(trigger_invalidates);
write_attribute(prune_cache);
+write_attribute(btree_wakeup);
rw_attribute(btree_gc_periodic);
rw_attribute(gc_gens_pos);
@@ -142,7 +176,7 @@ read_attribute(minor);
read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
-read_attribute(durability);
+rw_attribute(durability);
read_attribute(iodone);
read_attribute(io_latency_read);
@@ -151,30 +185,51 @@ read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
-read_attribute(btree_avg_write_size);
+read_attribute(btree_write_stats);
-read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(journal_pins);
read_attribute(btree_updates);
-read_attribute(dirty_btree_nodes);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
-read_attribute(btree_transactions);
read_attribute(stripes_heap);
read_attribute(open_buckets);
+read_attribute(open_buckets_partial);
+read_attribute(write_points);
+read_attribute(nocow_lock_table);
+
+#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(write_refs);
+
+const char * const bch2_write_refs[] = {
+#define x(n) #n,
+ BCH_WRITE_REFS()
+#undef x
+ NULL
+};
+
+static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ bch2_printbuf_tabstop_push(out, 24);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
+ prt_str(out, bch2_write_refs[i]);
+ prt_tab(out);
+ prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
+ prt_newline(out);
+ }
+}
+#endif
read_attribute(internal_uuid);
read_attribute(has_data);
read_attribute(alloc_debug);
-write_attribute(wake_allocator);
-read_attribute(read_realloc_races);
-read_attribute(extent_migrate_done);
-read_attribute(extent_migrate_raced);
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
rw_attribute(discard);
rw_attribute(label);
@@ -193,6 +248,7 @@ read_attribute(io_timers_read);
read_attribute(io_timers_write);
read_attribute(data_jobs);
+read_attribute(moving_ctxts);
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
@@ -200,13 +256,13 @@ write_attribute(perf_test);
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = S_IRUGO };
+ { .name = #_name, .mode = 0444 };
BCH_TIME_STATS()
#undef x
static struct attribute sysfs_state_rw = {
.name = "state",
- .mode = S_IRUGO
+ .mode = 0444,
};
static size_t bch2_btree_cache_size(struct bch_fs *c)
@@ -222,33 +278,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
-static size_t bch2_btree_avg_write_size(struct bch_fs *c)
-{
- u64 nr = atomic64_read(&c->btree_writes_nr);
- u64 sectors = atomic64_read(&c->btree_writes_sectors);
-
- return nr ? div64_u64(sectors, nr) : 0;
-}
-
-static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
-{
- long ret = 0;
- struct bch_move_stats *stats;
-
- mutex_lock(&c->data_progress_lock);
- list_for_each_entry(stats, &c->data_progress_list, list) {
- pr_buf(out, "%s: data type %s btree_id %s position: ",
- stats->name,
- bch2_data_types[stats->data_type],
- bch2_btree_ids[stats->btree_id]);
- bch2_bpos_to_text(out, stats->pos);
- pr_buf(out, "%s", "\n");
- }
-
- mutex_unlock(&c->data_progress_lock);
- return ret;
-}
-
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans trans;
@@ -270,7 +299,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
bch2_trans_init(&trans, c, 0, 0);
for (id = 0; id < BTREE_ID_NR; id++) {
- if (!((1U << id) & BTREE_ID_HAS_PTRS))
+ if (!btree_type_has_ptrs(id))
continue;
for_each_btree_key(&trans, iter, id, POS_MIN,
@@ -315,146 +344,132 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (ret)
return ret;
- pr_buf(out, "uncompressed:\n");
- pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents);
- pr_buf(out, " size: ");
- bch2_hprint(out, uncompressed_sectors << 9);
- pr_buf(out, "\n");
-
- pr_buf(out, "compressed:\n");
- pr_buf(out, " nr extents: %llu\n", nr_compressed_extents);
- pr_buf(out, " compressed size: ");
- bch2_hprint(out, compressed_sectors_compressed << 9);
- pr_buf(out, "\n");
- pr_buf(out, " uncompressed size: ");
- bch2_hprint(out, compressed_sectors_uncompressed << 9);
- pr_buf(out, "\n");
-
- pr_buf(out, "incompressible:\n");
- pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents);
- pr_buf(out, " size: ");
- bch2_hprint(out, incompressible_sectors << 9);
- pr_buf(out, "\n");
+ prt_printf(out, "uncompressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, uncompressed_sectors << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "compressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_compressed_extents);
+ prt_printf(out, " compressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_compressed << 9);
+ prt_printf(out, "\n");
+ prt_printf(out, " uncompressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "incompressible:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, incompressible_sectors << 9);
+ prt_printf(out, "\n");
return 0;
}
static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
- pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+ prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
bch2_bpos_to_text(out, c->gc_gens_pos);
- pr_buf(out, "\n");
+ prt_printf(out, "\n");
+}
+
+static void bch2_btree_wakeup_all(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+
+ mutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
+
+ if (b)
+ six_lock_wakeup_all(&b->lock);
+
+ }
+ mutex_unlock(&c->btree_trans_lock);
}
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
- sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
- sysfs_print(read_realloc_races,
- atomic_long_read(&c->read_realloc_races));
- sysfs_print(extent_migrate_done,
- atomic_long_read(&c->extent_migrate_done));
- sysfs_print(extent_migrate_raced,
- atomic_long_read(&c->extent_migrate_raced));
+ if (attr == &sysfs_btree_write_stats)
+ bch2_btree_write_stats_to_text(out, c);
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
- if (attr == &sysfs_gc_gens_pos) {
- bch2_gc_gens_pos_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_gc_gens_pos)
+ bch2_gc_gens_pos_to_text(out, c);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
- sysfs_hprint(copy_gc_wait,
- max(0LL, c->copygc_wait -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- if (attr == &sysfs_rebalance_work) {
- bch2_rebalance_work_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_copy_gc_wait)
+ bch2_copygc_wait_to_text(out, c);
+
+ if (attr == &sysfs_rebalance_work)
+ bch2_rebalance_work_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
/* Debugging: */
- if (attr == &sysfs_journal_debug) {
- bch2_journal_debug_to_text(&out, &c->journal);
- return out.pos - buf;
- }
+ if (attr == &sysfs_journal_debug)
+ bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_journal_pins) {
- bch2_journal_pins_to_text(&out, &c->journal);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_updates)
+ bch2_btree_updates_to_text(out, c);
- if (attr == &sysfs_btree_updates) {
- bch2_btree_updates_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_cache)
+ bch2_btree_cache_to_text(out, &c->btree_cache);
- if (attr == &sysfs_dirty_btree_nodes) {
- bch2_dirty_btree_nodes_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_btree_key_cache)
+ bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
- if (attr == &sysfs_btree_cache) {
- bch2_btree_cache_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_stripes_heap)
+ bch2_stripes_heap_to_text(out, c);
- if (attr == &sysfs_btree_key_cache) {
- bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
- return out.pos - buf;
- }
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c);
- if (attr == &sysfs_btree_transactions) {
- bch2_btree_trans_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_open_buckets_partial)
+ bch2_open_buckets_partial_to_text(out, c);
- if (attr == &sysfs_stripes_heap) {
- bch2_stripes_heap_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_write_points)
+ bch2_write_points_to_text(out, c);
- if (attr == &sysfs_open_buckets) {
- bch2_open_buckets_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_compression_stats)
+ bch2_compression_stats_to_text(out, c);
- if (attr == &sysfs_compression_stats) {
- bch2_compression_stats_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_new_stripes)
+ bch2_new_stripes_to_text(out, c);
- if (attr == &sysfs_new_stripes) {
- bch2_new_stripes_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_timers_read)
+ bch2_io_timers_to_text(out, &c->io_clock[READ]);
- if (attr == &sysfs_io_timers_read) {
- bch2_io_timers_to_text(&out, &c->io_clock[READ]);
- return out.pos - buf;
- }
- if (attr == &sysfs_io_timers_write) {
- bch2_io_timers_to_text(&out, &c->io_clock[WRITE]);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_timers_write)
+ bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
- if (attr == &sysfs_data_jobs) {
- data_progress_to_text(&out, c);
- return out.pos - buf;
- }
+ if (attr == &sysfs_data_jobs)
+ bch2_data_jobs_to_text(out, c);
+
+ if (attr == &sysfs_moving_ctxts)
+ bch2_fs_moving_ctxts_to_text(out, c);
+
+#ifdef BCH_WRITE_REF_DEBUG
+ if (attr == &sysfs_write_refs)
+ bch2_write_refs_to_text(out, c);
+#endif
+
+ if (attr == &sysfs_nocow_lock_table)
+ bch2_nocow_locks_to_text(out, &c->nocow_locks);
return 0;
}
@@ -499,6 +514,20 @@ STORE(bch2_fs)
/* Debugging: */
+ if (!test_bit(BCH_FS_RW, &c->flags))
+ return -EROFS;
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+ }
+
+ if (attr == &sysfs_btree_wakeup)
+ bch2_btree_wakeup_all(c);
+
if (attr == &sysfs_trigger_gc) {
/*
* Full gc is currently incompatible with btree key cache:
@@ -512,13 +541,11 @@ STORE(bch2_fs)
#endif
}
- if (attr == &sysfs_prune_cache) {
- struct shrink_control sc;
+ if (attr == &sysfs_trigger_discards)
+ bch2_do_discards(c);
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
- }
+ if (attr == &sysfs_trigger_invalidates)
+ bch2_do_invalidates(c);
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
@@ -547,7 +574,7 @@ SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
&sysfs_minor,
&sysfs_btree_cache_size,
- &sysfs_btree_avg_write_size,
+ &sysfs_btree_write_stats,
&sysfs_promote_whole_extents,
@@ -559,41 +586,87 @@ struct attribute *bch2_fs_files[] = {
NULL
};
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+ u64 counter = 0;
+ u64 counter_since_mount = 0;
+
+ printbuf_tabstop_push(out, 32);
+
+ #define x(t, ...) \
+ if (attr == &sysfs_##t) { \
+ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+ prt_printf(out, "since mount:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter_since_mount); \
+ prt_newline(out); \
+ \
+ prt_printf(out, "since filesystem creation:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter); \
+ prt_newline(out); \
+ }
+ BCH_PERSISTENT_COUNTERS()
+ #undef x
+ return 0;
+}
+
+STORE(bch2_fs_counters) {
+ return 0;
+}
+
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+ &sysfs_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
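
Illustrative only, not part of the patch: for a persistent counter named "foo" in BCH_PERSISTENT_COUNTERS() (a placeholder name, not necessarily a real counter), the x() macros above expand to roughly the following — a &sysfs_foo entry in bch2_fs_counters_files[] plus a show branch printing the value accumulated since mount and since filesystem creation:

	if (attr == &sysfs_foo) {
		counter = percpu_u64_get(&c->counters[BCH_COUNTER_foo]);
		counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_foo];

		prt_printf(out, "since mount:");
		prt_tab(out);
		prt_human_readable_u64(out, counter_since_mount);
		prt_newline(out);

		prt_printf(out, "since filesystem creation:");
		prt_tab(out);
		prt_human_readable_u64(out, counter);
		prt_newline(out);
	}
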
/* internal dir - just a wrapper */
SHOW(bch2_fs_internal)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
- return bch2_fs_show(&c->kobj, attr, buf);
+
+ return bch2_fs_to_text(out, &c->kobj, attr);
}
STORE(bch2_fs_internal)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
return bch2_fs_store(&c->kobj, attr, buf, size);
}
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_debug,
- &sysfs_journal_pins,
&sysfs_btree_updates,
- &sysfs_dirty_btree_nodes,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
- &sysfs_btree_transactions,
&sysfs_new_stripes,
&sysfs_stripes_heap,
&sysfs_open_buckets,
+ &sysfs_open_buckets_partial,
+ &sysfs_write_points,
+#ifdef BCH_WRITE_REF_DEBUG
+ &sysfs_write_refs,
+#endif
+ &sysfs_nocow_lock_table,
&sysfs_io_timers_read,
&sysfs_io_timers_write,
&sysfs_trigger_gc,
+ &sysfs_trigger_discards,
+ &sysfs_trigger_invalidates,
&sysfs_prune_cache,
-
- &sysfs_read_realloc_races,
- &sysfs_extent_migrate_done,
- &sysfs_extent_migrate_raced,
+ &sysfs_btree_wakeup,
&sysfs_gc_gens_pos,
@@ -605,6 +678,7 @@ struct attribute *bch2_fs_internal_files[] = {
sysfs_pd_controller_files(rebalance),
&sysfs_data_jobs,
+ &sysfs_moving_ctxts,
&sysfs_internal_uuid,
NULL
@@ -614,16 +688,15 @@ struct attribute *bch2_fs_internal_files[] = {
SHOW(bch2_fs_opts_dir)
{
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int id = opt - bch2_opt_table;
u64 v = bch2_opt_get_by_id(&c->opts, id);
- bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST);
- pr_buf(&out, "\n");
+ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
+ prt_char(out, '\n');
- return out.pos - buf;
+ return 0;
}
STORE(bch2_fs_opts_dir)
@@ -634,19 +707,28 @@ STORE(bch2_fs_opts_dir)
char *tmp;
u64 v;
+ /*
+ * We don't need to take c->writes for correctness, but it eliminates an
+ * unsightly error message in the dmesg log when we're RO:
+ */
+ if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
+ return -EROFS;
+
tmp = kstrdup(buf, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto err;
+ }
- ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
kfree(tmp);
if (ret < 0)
- return ret;
+ goto err;
ret = bch2_opt_check_may_set(c, id, v);
if (ret < 0)
- return ret;
+ goto err;
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
@@ -657,7 +739,10 @@ STORE(bch2_fs_opts_dir)
rebalance_wakeup(c);
}
- return size;
+ ret = size;
+err:
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ return ret;
}
SYSFS_OPS(bch2_fs_opts_dir);
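
A minimal sketch of the c->writes guard pattern introduced in the opts store path above. bch2_write_ref_tryget()/bch2_write_ref_put() and BCH_WRITE_REF_sysfs are taken from this patch; the function name and the "do work" placeholder are illustrative only:

	static ssize_t example_guarded_store(struct bch_fs *c,
					     const char *buf, size_t size)
	{
		ssize_t ret;

		/* Refuse quietly if the filesystem is read-only or going RO: */
		if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
			return -EROFS;

		ret = size;	/* ... parse buf and apply the change here ... */

		/* Always drop the ref, including on error paths it guards: */
		bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
		return ret;
	}
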
@@ -687,13 +772,10 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
#define x(name) \
- if (attr == &sysfs_time_stat_##name) { \
- bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\
- return out.pos - buf; \
- }
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
BCH_TIME_STATS()
#undef x
@@ -714,24 +796,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
-{
- enum alloc_reserve i;
-
- spin_lock(&ca->fs->freelist_lock);
-
- pr_buf(out, "free_inc:\t%zu\t%zu\n",
- fifo_used(&ca->free_inc),
- ca->free_inc.size);
-
- for (i = 0; i < RESERVE_NR; i++)
- pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
- fifo_used(&ca->free[i]),
- ca->free[i].size);
-
- spin_unlock(&ca->fs->freelist_lock);
-}
-
static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
@@ -743,46 +807,100 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
- pr_buf(out,
- "\t\t buckets\t sectors fragmented\n"
- "capacity%16llu\n",
- ca->mi.nbuckets - ca->mi.first_bucket);
-
- for (i = 1; i < BCH_DATA_NR; i++)
- pr_buf(out, "%-8s%16llu%16llu%16llu\n",
- bch2_data_types[i], stats.d[i].buckets,
- stats.d[i].sectors, stats.d[i].fragmented);
-
- pr_buf(out,
- "ec\t%16llu\n"
- "available%15llu\n"
- "\n"
- "free_inc\t\t%zu/%zu\n"
- "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
- "free[RESERVE_NONE]\t%zu/%zu\n"
- "freelist_wait\t\t%s\n"
- "open buckets allocated\t%u\n"
- "open buckets this dev\t%u\n"
- "open buckets total\t%u\n"
- "open_buckets_wait\t%s\n"
- "open_buckets_btree\t%u\n"
- "open_buckets_user\t%u\n"
- "btree reserve cache\t%u\n"
- "thread state:\t\t%s\n",
- stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
- c->freelist_wait.list.first ? "waiting" : "empty",
- OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
- ca->nr_open_buckets,
- OPEN_BUCKETS_COUNT,
- c->open_buckets_wait.list.first ? "waiting" : "empty",
- nr[BCH_DATA_btree],
- nr[BCH_DATA_user],
- c->btree_reserve_cache_nr,
- bch2_allocator_states[ca->allocator_state]);
+ printbuf_tabstop_push(out, 8);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+
+ prt_tab(out);
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+ prt_str(out, "sectors");
+ prt_tab_rjust(out);
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ prt_str(out, bch2_data_types[i]);
+ prt_tab(out);
+ prt_u64(out, stats.d[i].buckets);
+ prt_tab_rjust(out);
+ prt_u64(out, stats.d[i].sectors);
+ prt_tab_rjust(out);
+ prt_u64(out, stats.d[i].fragmented);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ prt_str(out, "ec");
+ prt_tab(out);
+ prt_u64(out, stats.buckets_ec);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_newline(out);
+
+ prt_printf(out, "reserves:");
+ prt_newline(out);
+ for (i = 0; i < RESERVE_NR; i++) {
+ prt_str(out, bch2_alloc_reserves[i]);
+ prt_tab(out);
+ prt_u64(out, bch2_dev_buckets_reserved(ca, i));
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 24);
+
+ prt_str(out, "freelist_wait");
+ prt_tab(out);
+ prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
+ prt_newline(out);
+
+ prt_str(out, "open buckets allocated");
+ prt_tab(out);
+ prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+ prt_newline(out);
+
+ prt_str(out, "open buckets this dev");
+ prt_tab(out);
+ prt_u64(out, ca->nr_open_buckets);
+ prt_newline(out);
+
+ prt_str(out, "open buckets total");
+ prt_tab(out);
+ prt_u64(out, OPEN_BUCKETS_COUNT);
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_wait");
+ prt_tab(out);
+ prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_btree");
+ prt_tab(out);
+ prt_u64(out, nr[BCH_DATA_btree]);
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_user");
+ prt_tab(out);
+ prt_u64(out, nr[BCH_DATA_user]);
+ prt_newline(out);
+
+ prt_str(out, "buckets_to_invalidate");
+ prt_tab(out);
+ prt_u64(out, should_invalidate_buckets(ca, stats));
+ prt_newline(out);
+
+ prt_str(out, "btree reserve cache");
+ prt_tab(out);
+ prt_u64(out, c->btree_reserve_cache_nr);
+ prt_newline(out);
}
static const char * const bch2_rw[] = {
@@ -796,10 +914,10 @@ static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
int rw, i;
for (rw = 0; rw < 2; rw++) {
- pr_buf(out, "%s:\n", bch2_rw[rw]);
+ prt_printf(out, "%s:\n", bch2_rw[rw]);
for (i = 1; i < BCH_DATA_NR; i++)
- pr_buf(out, "%-12s:%12llu\n",
+ prt_printf(out, "%-12s:%12llu\n",
bch2_data_types[i],
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
@@ -809,7 +927,6 @@ SHOW(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct printbuf out = _PBUF(buf, PAGE_SIZE);
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
@@ -822,58 +939,42 @@ SHOW(bch2_dev)
if (attr == &sysfs_label) {
if (ca->mi.group) {
mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(&out, &c->disk_sb,
+ bch2_disk_path_to_text(out, c->disk_sb.sb,
ca->mi.group - 1);
mutex_unlock(&c->sb_lock);
}
- pr_buf(&out, "\n");
- return out.pos - buf;
+ prt_char(out, '\n');
}
if (attr == &sysfs_has_data) {
- bch2_flags_to_text(&out, bch2_data_types,
- bch2_dev_has_data(c, ca));
- pr_buf(&out, "\n");
- return out.pos - buf;
+ prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_char(out, '\n');
}
if (attr == &sysfs_state_rw) {
- bch2_string_opt_to_text(&out, bch2_member_states,
- ca->mi.state);
- pr_buf(&out, "\n");
- return out.pos - buf;
+ prt_string_option(out, bch2_member_states, ca->mi.state);
+ prt_char(out, '\n');
}
- if (attr == &sysfs_iodone) {
- dev_iodone_to_text(&out, ca);
- return out.pos - buf;
- }
+ if (attr == &sysfs_iodone)
+ dev_iodone_to_text(out, ca);
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
- if (attr == &sysfs_io_latency_stats_read) {
- bch2_time_stats_to_text(&out, &ca->io_latency[READ]);
- return out.pos - buf;
- }
- if (attr == &sysfs_io_latency_stats_write) {
- bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]);
- return out.pos - buf;
- }
+ if (attr == &sysfs_io_latency_stats_read)
+ bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+
+ if (attr == &sysfs_io_latency_stats_write)
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_reserve_stats) {
- reserve_stats_to_text(&out, ca);
- return out.pos - buf;
- }
- if (attr == &sysfs_alloc_debug) {
- dev_alloc_debug_to_text(&out, ca);
- return out.pos - buf;
- }
+ if (attr == &sysfs_alloc_debug)
+ dev_alloc_debug_to_text(out, ca);
return 0;
}
@@ -897,6 +998,19 @@ STORE(bch2_dev)
mutex_unlock(&c->sb_lock);
}
+ if (attr == &sysfs_durability) {
+ u64 v = strtoul_or_return(buf);
+
+ mutex_lock(&c->sb_lock);
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+
+ if (v != BCH_MEMBER_DURABILITY(mi)) {
+ SET_BCH_MEMBER_DURABILITY(mi, v + 1);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
if (attr == &sysfs_label) {
char *tmp;
int ret;
@@ -911,9 +1025,6 @@ STORE(bch2_dev)
return ret;
}
- if (attr == &sysfs_wake_allocator)
- bch2_wake_allocator(ca);
-
return size;
}
SYSFS_OPS(bch2_dev);
@@ -939,11 +1050,8 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write,
&sysfs_congested,
- &sysfs_reserve_stats,
-
/* debug: */
&sysfs_alloc_debug,
- &sysfs_wake_allocator,
NULL
};
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
index 525fd05d91f7..222cd5062702 100644
--- a/fs/bcachefs/sysfs.h
+++ b/fs/bcachefs/sysfs.h
@@ -10,28 +10,32 @@ struct attribute;
struct sysfs_ops;
extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
extern struct attribute *bch2_fs_internal_files[];
extern struct attribute *bch2_fs_opts_dir_files[];
extern struct attribute *bch2_fs_time_stats_files[];
extern struct attribute *bch2_dev_files[];
-extern struct sysfs_ops bch2_fs_sysfs_ops;
-extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern struct sysfs_ops bch2_dev_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
int bch2_opts_create_sysfs_files(struct kobject *);
#else
static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
static struct attribute *bch2_fs_internal_files[] = {};
static struct attribute *bch2_fs_opts_dir_files[] = {};
static struct attribute *bch2_fs_time_stats_files[] = {};
static struct attribute *bch2_dev_files[] = {};
static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index de84ce834975..d352821d5614 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -15,15 +15,15 @@ static void delete_test_keys(struct bch_fs *c)
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_extents,
- POS_MIN, SPOS_MAX,
- BTREE_ITER_ALL_SNAPSHOTS,
- NULL);
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
BUG_ON(ret);
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- POS_MIN, SPOS_MAX,
- BTREE_ITER_ALL_SNAPSHOTS,
- NULL);
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
BUG_ON(ret);
}
@@ -43,29 +43,29 @@ static int test_delete(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
- bch_err(c, "update error in test_delete: %i", ret);
+ bch_err(c, "%s(): update error in: %s", __func__, bch2_err_str(ret));
goto err;
}
pr_info("deleting once");
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
- bch_err(c, "delete error (first) in test_delete: %i", ret);
+ bch_err(c, "%s(): delete error (first): %s", __func__, bch2_err_str(ret));
goto err;
}
pr_info("deleting twice");
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
- bch_err(c, "delete error (second) in test_delete: %i", ret);
+ bch_err(c, "%s(): delete error (second): %s", __func__, bch2_err_str(ret));
goto err;
}
err:
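
As a compact reference for the conversions in these tests: the commit_do() idiom replacing __bch2_trans_do() runs a transactional expression and commits it, retrying on transaction restart (as the conversions suggest). A sketch of the insert-one-key pattern, with a made-up wrapper name:

	static int example_insert_one(struct btree_trans *trans,
				      struct btree_iter *iter,
				      struct bkey_i *k)
	{
		return commit_do(trans, NULL, NULL, 0,
				 bch2_btree_iter_traverse(iter) ?:
				 bch2_trans_update(trans, iter, k, 0));
	}
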
@@ -89,22 +89,22 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
- bch_err(c, "update error in test_delete_written: %i", ret);
+ bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret));
goto err;
}
bch2_trans_unlock(&trans);
bch2_journal_flush_all_pins(&c->journal);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
- bch_err(c, "delete error in test_delete_written: %i", ret);
+ bch_err(c, "%s(): delete error: %s", __func__, bch2_err_str(ret));
goto err;
}
err:
@@ -137,7 +137,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate: %i", ret);
+ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret));
goto err;
}
}
@@ -146,20 +146,31 @@ static int test_iterate(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
- if (k.k->p.inode)
- break;
-
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
BUG_ON(k.k->p.offset != i++);
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
BUG_ON(i != nr);
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
- BUG_ON(k.k->p.offset != --i);
+ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != --i);
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret));
+ goto err;
+ }
BUG_ON(i);
err:
@@ -193,7 +204,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate_extents: %i", ret);
+ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret));
goto err;
}
}
@@ -202,19 +213,32 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
BUG_ON(bkey_start_offset(k.k) != i);
i = k.k->p.offset;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
BUG_ON(i != nr);
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
- BUG_ON(k.k->p.offset != i);
- i = bkey_start_offset(k.k);
+ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != i);
+ i = bkey_start_offset(k.k);
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
BUG_ON(i);
@@ -248,7 +272,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate_slots: %i", ret);
+ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret));
goto err;
}
}
@@ -257,15 +281,17 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
- if (k.k->p.inode)
- break;
-
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
BUG_ON(k.k->p.offset != i);
i += 2;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr * 2);
@@ -273,17 +299,23 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS, k, ret) {
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i >= nr * 2)
+ break;
+
BUG_ON(k.k->p.offset != i);
BUG_ON(bkey_deleted(k.k) != (i & 1));
i++;
- if (i == nr * 2)
- break;
+ 0;
+ }));
+ if (ret < 0) {
+ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
+ ret = 0;
err:
bch2_trans_exit(&trans);
return ret;
@@ -314,7 +346,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
if (ret) {
- bch_err(c, "insert error in test_iterate_slots_extents: %i", ret);
+ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret));
goto err;
}
}
@@ -323,13 +355,18 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), 0, k, ret) {
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
BUG_ON(bkey_start_offset(k.k) != i + 8);
BUG_ON(k.k->size != 8);
i += 16;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr);
@@ -337,19 +374,23 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS, k, ret) {
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i == nr)
+ break;
BUG_ON(bkey_deleted(k.k) != !(i % 16));
BUG_ON(bkey_start_offset(k.k) != i);
BUG_ON(k.k->size != 8);
i = k.k->p.offset;
-
- if (i == nr)
- break;
+ 0;
+ }));
+ if (ret) {
+ bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret));
+ goto err;
}
- bch2_trans_iter_exit(&trans, &iter);
+ ret = 0;
err:
bch2_trans_exit(&trans);
return 0;
@@ -369,10 +410,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(&trans, &iter);
@@ -390,10 +431,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(&trans, &iter);
@@ -420,7 +461,7 @@ static int insert_test_extent(struct bch_fs *c,
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
if (ret)
- bch_err(c, "insert error in insert_test_extent: %i", ret);
+ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -483,7 +524,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k->p.snapshot != U32_MAX);
@@ -519,7 +560,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
ret = test_snapshot_filter(c, snapids[0], snapids[1]);
if (ret) {
- bch_err(c, "err %i from test_snapshot_filter", ret);
+ bch_err(c, "%s(): err from test_snapshot_filter: %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -531,11 +572,8 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
static u64 test_rand(void)
{
u64 v;
-#if 0
- v = prandom_u32();
-#else
- prandom_bytes(&v, sizeof(v));
-#endif
+
+ get_random_bytes(&v, sizeof(v));
return v;
}
@@ -553,10 +591,10 @@ static int rand_insert(struct bch_fs *c, u64 nr)
k.k.p.offset = test_rand();
k.k.p.snapshot = U32_MAX;
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
+ ret = commit_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0));
if (ret) {
- bch_err(c, "error in rand_insert: %i", ret);
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
break;
}
}
@@ -582,17 +620,17 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
k[j].k.p.snapshot = U32_MAX;
}
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i));
+ ret = commit_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0));
if (ret) {
- bch_err(c, "error in rand_insert_multi: %i", ret);
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
break;
}
}
@@ -616,10 +654,10 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
for (i = 0; i < nr; i++) {
bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
ret = bkey_err(k);
if (ret) {
- bch_err(c, "error in rand_lookup: %i", ret);
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
break;
}
}
@@ -641,8 +679,8 @@ static int rand_mixed_trans(struct btree_trans *trans,
k = bch2_btree_iter_peek(iter);
ret = bkey_err(k);
- if (ret && ret != -EINTR)
- bch_err(trans->c, "lookup error in rand_mixed: %i", ret);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(trans->c, "%s(): lookup error: %s", __func__, bch2_err_str(ret));
if (ret)
return ret;
@@ -669,10 +707,10 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
for (i = 0; i < nr; i++) {
rand = test_rand();
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
rand_mixed_trans(&trans, &iter, &cookie, i, rand));
if (ret) {
- bch_err(c, "update error in rand_mixed: %i", ret);
+ bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret));
break;
}
}
@@ -690,7 +728,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
ret = bkey_err(k);
if (ret)
goto err;
@@ -715,10 +753,10 @@ static int rand_delete(struct bch_fs *c, u64 nr)
for (i = 0; i < nr; i++) {
struct bpos pos = SPOS(0, test_rand(), U32_MAX);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
__do_delete(&trans, pos));
if (ret) {
- bch_err(c, "error in rand_delete: %i", ret);
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
break;
}
}
@@ -734,28 +772,23 @@ static int seq_insert(struct bch_fs *c, u64 nr)
struct bkey_s_c k;
struct bkey_i_cookie insert;
int ret = 0;
- u64 i = 0;
bkey_cookie_init(&insert.k_i);
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- insert.k.p = iter.pos;
-
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, &insert.k_i, 0));
- if (ret) {
- bch_err(c, "error in seq_insert: %i", ret);
- break;
- }
-
- if (++i == nr)
- break;
- }
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+ NULL, NULL, 0,
+ ({
+ if (iter.pos.offset >= nr)
+ break;
+ insert.k.p = iter.pos;
+ bch2_trans_update(&trans, &iter, &insert.k_i, 0);
+ }));
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
@@ -770,10 +803,12 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), 0, k, ret)
- ;
- bch2_trans_iter_exit(&trans, &iter);
+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k,
+ 0);
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
@@ -788,22 +823,18 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX),
- BTREE_ITER_INTENT, k, ret) {
- struct bkey_i_cookie u;
-
- bkey_reassemble(&u.k_i, k);
+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, 0,
+ ({
+ struct bkey_i_cookie u;
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, &u.k_i, 0));
- if (ret) {
- bch_err(c, "error in seq_overwrite: %i", ret);
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
+ bkey_reassemble(&u.k_i, k);
+ bch2_trans_update(&trans, &iter, &u.k_i, 0);
+ }));
+ if (ret)
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
bch2_trans_exit(&trans);
return ret;
@@ -814,11 +845,11 @@ static int seq_delete(struct bch_fs *c, u64 nr)
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- POS_MIN, SPOS_MAX,
- BTREE_ITER_ALL_SNAPSHOTS,
- NULL);
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
if (ret)
- bch_err(c, "error in seq_delete: %i", ret);
+ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
return ret;
}
@@ -855,7 +886,7 @@ static int btree_perf_test_thread(void *data)
ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
if (ret) {
- bch_err(j->c, "%ps: error %i", j->fn, ret);
+ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
j->ret = ret;
}
@@ -871,7 +902,9 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
u64 nr, unsigned nr_threads)
{
struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
- char name_buf[20], nr_buf[20], per_sec_buf[20];
+ char name_buf[20];
+ struct printbuf nr_buf = PRINTBUF;
+ struct printbuf per_sec_buf = PRINTBUF;
unsigned i;
u64 time;
@@ -932,13 +965,15 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
time = j.finish - j.start;
scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
- bch2_hprint(&PBUF(nr_buf), nr);
- bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
+ prt_human_readable_u64(&nr_buf, nr);
+ prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
- name_buf, nr_buf, nr_threads,
+ name_buf, nr_buf.buf, nr_threads,
div_u64(time, NSEC_PER_SEC),
div_u64(time * nr_threads, nr),
- per_sec_buf);
+ per_sec_buf.buf);
+ printbuf_exit(&per_sec_buf);
+ printbuf_exit(&nr_buf);
return j.ret;
}
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
index 59e8dfa3d245..6813147d73d3 100644
--- a/fs/bcachefs/trace.c
+++ b/fs/bcachefs/trace.c
@@ -2,11 +2,15 @@
#include "bcachefs.h"
#include "alloc_types.h"
#include "buckets.h"
-#include "btree_types.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
#include "keylist.h"
+#include "opts.h"
#include <linux/blktrace_api.h>
-#include "keylist.h"
+#include <linux/six.h>
#define CREATE_TRACE_POINTS
#include <trace/events/bcachefs.h>
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
new file mode 100644
index 000000000000..9764c2e6a910
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "two_state_shared_lock.h"
+
+void __bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
+}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
new file mode 100644
index 000000000000..905801772002
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TWO_STATE_LOCK_H
+#define _BCACHEFS_TWO_STATE_LOCK_H
+
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include "util.h"
+
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like read side of rwsem, but conflict with other state:
+ */
+typedef struct {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+} two_state_lock_t;
+
+static inline void two_state_lock_init(two_state_lock_t *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+
+ EBUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+ long v = atomic_long_read(&lock->v), old;
+
+ do {
+ old = v;
+
+ if (i > 0 ? v < 0 : v > 0)
+ return false;
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+ old, old + i)) != old);
+ return true;
+}
+
+void __bch2_two_state_lock(two_state_lock_t *, int);
+
+static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ if (!bch2_two_state_trylock(lock, s))
+ __bch2_two_state_lock(lock, s);
+}
+
+#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
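
A usage sketch for the new two-state shared lock (illustrative, not from the patch): holders of the same state share the lock like readers of an rwsem, while a caller asking for the opposite state blocks until every current holder drops it.

	static two_state_lock_t example_lock;

	static void example_two_state(void)
	{
		two_state_lock_init(&example_lock);

		bch2_two_state_lock(&example_lock, 0);	/* first state-0 holder */
		bch2_two_state_lock(&example_lock, 0);	/* shares with the first */

		/* bch2_two_state_trylock(&example_lock, 1) would return false here */

		bch2_two_state_unlock(&example_lock, 0);
		bch2_two_state_unlock(&example_lock, 0);	/* last unlock wakes state-1 waiters */
	}
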
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 0bbea332fcaa..c50473d4925d 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -8,6 +8,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/console.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/freezer.h>
@@ -21,22 +22,26 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
+#include <linux/mean_and_variance.h>
#include "eytzinger.h"
#include "util.h"
static const char si_units[] = "?kMGTPEZY";
-static int __bch2_strtoh(const char *cp, u64 *res,
- u64 t_max, bool t_signed)
+/* string_get_size units: */
+static const char *const units_2[] = {
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+};
+static const char *const units_10[] = {
+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+};
+
+static int parse_u64(const char *cp, u64 *res)
{
- bool positive = *cp != '-';
- unsigned u;
+ const char *start = cp;
u64 v = 0;
- if (*cp == '+' || *cp == '-')
- cp++;
-
if (!isdigit(*cp))
return -EINVAL;
@@ -50,22 +55,122 @@ static int __bch2_strtoh(const char *cp, u64 *res,
cp++;
} while (isdigit(*cp));
+ *res = v;
+ return cp - start;
+}
+
+static int bch2_pow(u64 n, u64 p, u64 *res)
+{
+ *res = 1;
+
+ while (p--) {
+ if (*res > div_u64(U64_MAX, n))
+ return -ERANGE;
+ *res *= n;
+ }
+ return 0;
+}
+
+static int parse_unit_suffix(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 base = 1024;
+ unsigned u;
+ int ret;
+
+ if (*cp == ' ')
+ cp++;
+
for (u = 1; u < strlen(si_units); u++)
if (*cp == si_units[u]) {
cp++;
goto got_unit;
}
- u = 0;
+
+ for (u = 0; u < ARRAY_SIZE(units_2); u++)
+ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
+ cp += strlen(units_2[u]);
+ goto got_unit;
+ }
+
+ for (u = 0; u < ARRAY_SIZE(units_10); u++)
+ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
+ cp += strlen(units_10[u]);
+ base = 1000;
+ goto got_unit;
+ }
+
+ *res = 1;
+ return 0;
got_unit:
- if (*cp == '\n')
+ ret = bch2_pow(base, u, res);
+ if (ret)
+ return ret;
+
+ return cp - start;
+}
+
+#define parse_or_ret(cp, _f) \
+do { \
+ int ret = _f; \
+ if (ret < 0) \
+ return ret; \
+ cp += ret; \
+} while (0)
+
+static int __bch2_strtou64_h(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 v = 0, b, f_n = 0, f_d = 1;
+ int ret;
+
+ parse_or_ret(cp, parse_u64(cp, &v));
+
+ if (*cp == '.') {
cp++;
- if (*cp)
- return -EINVAL;
+ ret = parse_u64(cp, &f_n);
+ if (ret < 0)
+ return ret;
+ cp += ret;
+
+ ret = bch2_pow(10, ret, &f_d);
+ if (ret)
+ return ret;
+ }
+
+ parse_or_ret(cp, parse_unit_suffix(cp, &b));
- if (fls64(v) + u * 10 > 64)
+ if (v > div_u64(U64_MAX, b))
return -ERANGE;
+ v *= b;
+
+ if (f_n > div_u64(U64_MAX, b))
+ return -ERANGE;
+
+ f_n = div_u64(f_n * b, f_d);
+ if (v + f_n < v)
+ return -ERANGE;
+ v += f_n;
+
+ *res = v;
+ return cp - start;
+}
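
A worked example of the new human-readable parser (arithmetic only, assuming the helpers behave as written above):

	/*
	 * Parsing "1.5GiB":
	 *   parse_u64()          consumes "1"   -> v   = 1
	 *   the ".5" branch      consumes "5"   -> f_n = 5, f_d = 10^1 = 10
	 *   parse_unit_suffix()  matches "GiB"  -> b   = 1024^3
	 *
	 *   result = v * b + (f_n * b) / f_d
	 *          = 1073741824 + 5368709120 / 10
	 *          = 1610612736          (i.e. 1.5 * 2^30)
	 */
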
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+ u64 t_max, bool t_signed)
+{
+ bool positive = *cp != '-';
+ u64 v = 0;
- v <<= u * 10;
+ if (*cp == '+' || *cp == '-')
+ cp++;
+
+ parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
+
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
if (positive) {
if (v > t_max)
@@ -86,7 +191,7 @@ got_unit:
#define STRTO_H(name, type) \
int bch2_ ## name ## _h(const char *cp, type *res) \
{ \
- u64 v; \
+ u64 v = 0; \
int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
ANYSINT_MAX(type) != ((type) ~0ULL)); \
*res = v; \
@@ -99,58 +204,6 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)
-void bch2_hprint(struct printbuf *buf, s64 v)
-{
- int u, t = 0;
-
- for (u = 0; v >= 1024 || v <= -1024; u++) {
- t = v & ~(~0U << 10);
- v >>= 10;
- }
-
- pr_buf(buf, "%lli", v);
-
- /*
- * 103 is magic: t is in the range [-1023, 1023] and we want
- * to turn it into [-9, 9]
- */
- if (u && t && v < 100 && v > -100)
- pr_buf(buf, ".%i", t / 103);
- if (u)
- pr_buf(buf, "%c", si_units[u]);
-}
-
-void bch2_string_opt_to_text(struct printbuf *out,
- const char * const list[],
- size_t selected)
-{
- size_t i;
-
- for (i = 0; list[i]; i++)
- pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]);
-}
-
-void bch2_flags_to_text(struct printbuf *out,
- const char * const list[], u64 flags)
-{
- unsigned bit, nr = 0;
- bool first = true;
-
- if (out->pos != out->end)
- *out->pos = '\0';
-
- while (list[nr])
- nr++;
-
- while (flags && (bit = __ffs(flags)) < nr) {
- if (!first)
- pr_buf(out, ",");
- first = false;
- pr_buf(out, "%s", list[bit]);
- flags ^= 1 << bit;
- }
-}
-
u64 bch2_read_flag_list(char *opt, const char * const list[])
{
u64 ret = 0;
@@ -187,12 +240,85 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-static void bch2_quantiles_update(struct quantiles *q, u64 v)
+void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+{
+ while (nr_bits)
+ prt_char(out, '0' + ((v >> --nr_bits) & 1));
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+ const char *p;
+
+ if (!lines) {
+ printk("%s (null)\n", prefix);
+ return;
+ }
+
+ console_lock();
+ while (1) {
+ p = strchrnul(lines, '\n');
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+ if (!*p)
+ break;
+ lines = p + 1;
+ }
+ console_unlock();
+}
+
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
+{
+ unsigned nr_entries = 0;
+ int ret = 0;
+
+ stack->nr = 0;
+ ret = darray_make_room(stack, 32);
+ if (ret)
+ return ret;
+
+ if (!down_read_trylock(&task->signal->exec_update_lock))
+ return -1;
+
+ do {
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0);
+ } while (nr_entries == stack->size &&
+ !(ret = darray_make_room(stack, stack->size * 2)));
+
+ stack->nr = nr_entries;
+ up_read(&task->signal->exec_update_lock);
+
+ return ret;
+}
+
+void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
+{
+ unsigned long *i;
+
+ darray_for_each(*stack, i) {
+ prt_printf(out, "[<0>] %pB", (void *) *i);
+ prt_newline(out);
+ }
+}
+
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
+{
+ bch_stacktrace stack = { 0 };
+ int ret = bch2_save_backtrace(&stack, task);
+
+ bch2_prt_backtrace(out, &stack);
+ darray_exit(&stack);
+ return ret;
+}
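
A short usage sketch combining the helpers above (illustrative; error handling omitted, the function name is made up, and PRINTBUF comes from printbuf.h):

	static void example_dump_task_stack(struct task_struct *task)
	{
		struct printbuf buf = PRINTBUF;

		/* Collect the task's stack into a darray, then render it: */
		bch2_prt_task_backtrace(&buf, task);

		/* Emit one printk() line per backtrace entry: */
		bch2_print_string_as_lines(KERN_ERR, buf.buf);

		printbuf_exit(&buf);
	}
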
+
+/* time stats: */
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
- struct quantile_entry *e = q->entries + i;
+ struct bch2_quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
@@ -217,85 +343,99 @@ static void bch2_quantiles_update(struct quantiles *q, u64 v)
}
}
-/* time stats: */
-
-static void bch2_time_stats_update_one(struct time_stats *stats,
- u64 start, u64 end)
+static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
{
u64 duration, freq;
- duration = time_after64(end, start)
- ? end - start : 0;
- freq = time_after64(end, stats->last_event)
- ? end - stats->last_event : 0;
-
- stats->count++;
-
- stats->average_duration = stats->average_duration
- ? ewma_add(stats->average_duration, duration, 6)
- : duration;
+ if (time_after64(end, start)) {
+ duration = end - start;
+ stats->duration_stats = mean_and_variance_update_inlined(stats->duration_stats,
+ duration);
+ stats->duration_stats_weighted = mean_and_variance_weighted_update(
+ stats->duration_stats_weighted,
+ duration);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ bch2_quantiles_update(&stats->quantiles, duration);
+ }
- stats->average_frequency = stats->average_frequency
- ? ewma_add(stats->average_frequency, freq, 6)
- : freq;
+ if (time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ stats->freq_stats = mean_and_variance_update_inlined(stats->freq_stats, freq);
+ stats->freq_stats_weighted = mean_and_variance_weighted_update(
+ stats->freq_stats_weighted,
+ freq);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ stats->last_event = end;
+ }
+}
- stats->max_duration = max(stats->max_duration, duration);
+static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ struct bch2_time_stat_buffer_entry *i;
+ unsigned long flags;
- stats->last_event = end;
+ spin_lock_irqsave(&stats->lock, flags);
+ for (i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ spin_unlock_irqrestore(&stats->lock, flags);
- bch2_quantiles_update(&stats->quantiles, duration);
+ b->nr = 0;
}
-void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
+ WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
+ "time_stats: min_duration = %llu, min_freq = %llu",
+ stats->min_duration, stats->min_freq);
+
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
bch2_time_stats_update_one(stats, start, end);
- if (stats->average_frequency < 32 &&
- stats->count > 1024)
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+ stats->duration_stats.n > 1024)
stats->buffer =
- alloc_percpu_gfp(struct time_stat_buffer,
+ alloc_percpu_gfp(struct bch2_time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
- struct time_stat_buffer_entry *i;
- struct time_stat_buffer *b;
+ struct bch2_time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
.start = start,
.end = end
};
- if (b->nr == ARRAY_SIZE(b->entries)) {
- spin_lock_irqsave(&stats->lock, flags);
- for (i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
- spin_unlock_irqrestore(&stats->lock, flags);
-
- b->nr = 0;
- }
-
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ bch2_time_stats_clear_buffer(stats, b);
preempt_enable();
}
}
+#endif
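
The caller side keeps the same shape as before: bracket the operation with two timestamps and hand them to __bch2_time_stats_update(). A minimal sketch (the function name and the measured work are placeholders; local_clock() is used as elsewhere in this patch):

	static void example_timed_operation(struct bch2_time_stats *stats)
	{
		u64 start = local_clock();

		/* ... the work being measured ... */

		/*
		 * Slow path takes stats->lock; once the event rate is high
		 * enough, the fast path appends to a per-cpu buffer flushed
		 * in batches by bch2_time_stats_clear_buffer():
		 */
		__bch2_time_stats_update(stats, start, local_clock());
	}
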
static const struct time_unit {
const char *name;
- u32 nsecs;
+ u64 nsecs;
} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "sec", NSEC_PER_SEC },
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
@@ -311,57 +451,153 @@ static const struct time_unit *pick_time_units(u64 ns)
return u;
}
-static void pr_time_units(struct printbuf *out, u64 ns)
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
- pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
-void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
+static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
- const struct time_unit *u;
- u64 freq = READ_ONCE(stats->average_frequency);
- u64 q, last_q = 0;
- int i;
+ const struct time_unit *u = pick_time_units(ns);
- pr_buf(out, "count:\t\t%llu\n",
- stats->count);
- pr_buf(out, "rate:\t\t%llu/sec\n",
- freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
+ prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
+ prt_tab_rjust(out);
+ prt_printf(out, "%s", u->name);
+}
- pr_buf(out, "frequency:\t");
- pr_time_units(out, freq);
+#define TABSTOP_SIZE 12
- pr_buf(out, "\navg duration:\t");
- pr_time_units(out, stats->average_duration);
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+ prt_str(out, name);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, ns);
+ prt_newline(out);
+}
+
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+{
+ const struct time_unit *u;
+ s64 f_mean = 0, d_mean = 0;
+ u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
+ int i;
+ /*
+ * avoid divide by zero
+ */
+ if (stats->freq_stats.n) {
+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+ }
- pr_buf(out, "\nmax duration:\t");
- pr_time_units(out, stats->max_duration);
+ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+ prt_printf(out, "count:");
+ prt_tab(out);
+ prt_printf(out, "%llu ",
+ stats->duration_stats.n);
+ printbuf_tabstop_pop(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+ printbuf_tabstop_push(out, 0);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+ prt_tab(out);
+ prt_printf(out, "since mount");
+ prt_tab_rjust(out);
+ prt_tab(out);
+ prt_printf(out, "recent");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+ printbuf_tabstop_push(out, 2);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+
+ prt_printf(out, "duration of events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_duration);
+ pr_name_and_units(out, "max:", stats->max_duration);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, d_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, d_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ prt_printf(out, "time between events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_freq);
+ pr_name_and_units(out, "max:", stats->max_freq);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, f_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, f_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
i = eytzinger0_first(NR_QUANTILES);
u = pick_time_units(stats->quantiles.entries[i].m);
- pr_buf(out, "\nquantiles (%s):\t", u->name);
+ prt_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
q = max(stats->quantiles.entries[i].m, last_q);
- pr_buf(out, "%llu%s",
- div_u64(q, u->nsecs),
- is_last ? "\n" : " ");
+ prt_printf(out, "%llu ",
+ div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
last_q = q;
}
}
-void bch2_time_stats_exit(struct time_stats *stats)
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
}
-void bch2_time_stats_init(struct time_stats *stats)
+void bch2_time_stats_init(struct bch2_time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
+ stats->duration_stats_weighted.w = 8;
+ stats->freq_stats_weighted.w = 8;
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
@@ -467,36 +703,45 @@ void bch2_pd_controller_init(struct bch_pd_controller *pd)
pd->backpressure = 1;
}
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
{
- /* 2^64 - 1 is 20 digits, plus null byte */
- char rate[21];
- char actual[21];
- char target[21];
- char proportional[21];
- char derivative[21];
- char change[21];
- s64 next_io;
-
- bch2_hprint(&PBUF(rate), pd->rate.rate);
- bch2_hprint(&PBUF(actual), pd->last_actual);
- bch2_hprint(&PBUF(target), pd->last_target);
- bch2_hprint(&PBUF(proportional), pd->last_proportional);
- bch2_hprint(&PBUF(derivative), pd->last_derivative);
- bch2_hprint(&PBUF(change), pd->last_change);
-
- next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
-
- return sprintf(buf,
- "rate:\t\t%s/sec\n"
- "target:\t\t%s\n"
- "actual:\t\t%s\n"
- "proportional:\t%s\n"
- "derivative:\t%s\n"
- "change:\t\t%s/sec\n"
- "next io:\t%llims\n",
- rate, target, actual, proportional,
- derivative, change, next_io);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "rate:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->rate.rate);
+ prt_newline(out);
+
+ prt_printf(out, "target:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_target);
+ prt_newline(out);
+
+ prt_printf(out, "actual:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_actual);
+ prt_newline(out);
+
+ prt_printf(out, "proportional:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_proportional);
+ prt_newline(out);
+
+ prt_printf(out, "derivative:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_derivative);
+ prt_newline(out);
+
+ prt_printf(out, "change:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_change);
+ prt_newline(out);
+
+ prt_printf(out, "next io:");
+ prt_tab(out);
+ prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+ prt_newline(out);
}
/* misc: */
@@ -516,10 +761,10 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size)
}
}
-int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
+int _bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
{
while (size) {
- struct page *page = alloc_page(gfp_mask);
+ struct page *page = _alloc_pages(gfp_mask, 0);
unsigned len = min_t(size_t, PAGE_SIZE, size);
if (!page)
@@ -579,21 +824,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
-void bch_scnmemcpy(struct printbuf *out,
- const char *src, size_t len)
-{
- size_t n = printbuf_remaining(out);
-
- if (n) {
- n = min(n - 1, len);
- memcpy(out->pos, src, n);
- out->pos += n;
- *out->pos = '\0';
- }
-}
-
-#include "eytzinger.h"
-
static int alignment_ok(const void *base, size_t align)
{
return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index df8c5d52f98a..ecfe54012e3d 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -17,40 +17,16 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/mean_and_variance.h>
-#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - 9)
-#define PAGE_SECTORS (1UL << PAGE_SECTORS_SHIFT)
+#include "darray.h"
struct closure;
#ifdef CONFIG_BCACHEFS_DEBUG
-
#define EBUG_ON(cond) BUG_ON(cond)
-#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
-#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
-#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0)
-#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0)
-#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0)
-#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0)
-#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0)
-#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i)
-#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
-#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-
-#else /* DEBUG */
-
+#else
#define EBUG_ON(cond)
-#define atomic_dec_bug(v) atomic_dec(v)
-#define atomic_inc_bug(v, i) atomic_inc(v)
-#define atomic_sub_bug(i, v) atomic_sub(i, v)
-#define atomic_add_bug(i, v) atomic_add(i, v)
-#define atomic_long_dec_bug(v) atomic_long_dec(v)
-#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v)
-#define atomic64_dec_bug(v) atomic64_dec(v)
-#define atomic64_inc_bug(v, i) atomic64_inc(v)
-#define atomic64_sub_bug(i, v) atomic64_sub(i, v)
-#define atomic64_add_bug(i, v) atomic64_add(i, v)
-
#endif
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
@@ -84,12 +60,14 @@ static inline void vpfree(void *p, size_t size)
free_pages((unsigned long) p, get_order(size));
}
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+static inline void *_vpmalloc(size_t size, gfp_t gfp_mask)
{
- return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+ return (void *) _get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
- __vmalloc(size, gfp_mask, PAGE_KERNEL);
+ __vmalloc(size, gfp_mask);
}
+#define vpmalloc(_size, _gfp) \
+ alloc_hooks(_vpmalloc(_size, _gfp), void *, NULL)
static inline void kvpfree(void *p, size_t size)
{
@@ -99,12 +77,14 @@ static inline void kvpfree(void *p, size_t size)
vpfree(p, size);
}
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+static inline void *_kvpmalloc(size_t size, gfp_t gfp_mask)
{
return size < PAGE_SIZE
- ? kmalloc(size, gfp_mask)
- : vpmalloc(size, gfp_mask);
+ ? _kmalloc(size, gfp_mask)
+ : _vpmalloc(size, gfp_mask);
}
+#define kvpmalloc(_size, _gfp) \
+ alloc_hooks(_kvpmalloc(_size, _gfp), void *, NULL)
int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
@@ -213,9 +193,11 @@ do { \
\
BUG_ON(_i >= (h)->used); \
(h)->used--; \
- heap_swap(h, _i, (h)->used, set_backpointer); \
- heap_sift_up(h, _i, cmp, set_backpointer); \
- heap_sift_down(h, _i, cmp, set_backpointer); \
+ if ((_i) < (h)->used) { \
+ heap_swap(h, _i, (h)->used, set_backpointer); \
+ heap_sift_up(h, _i, cmp, set_backpointer); \
+ heap_sift_down(h, _i, cmp, set_backpointer); \
+ } \
} while (0)
#define heap_pop(h, d, cmp, set_backpointer) \
@@ -238,31 +220,71 @@ do { \
#define ANYSINT_MAX(t) \
((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
-struct printbuf {
- char *pos;
- char *end;
-};
+#include "printbuf.h"
+
+#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__)
+#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__)
+#define printbuf_str(_buf) bch2_printbuf_str(_buf)
+#define printbuf_exit(_buf) bch2_printbuf_exit(_buf)
+
+#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf)
+#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf)
+#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
+
+#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
+#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
+
+#define prt_newline(_out) bch2_prt_newline(_out)
+#define prt_tab(_out) bch2_prt_tab(_out)
+#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out)
+
+#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__)
+#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v))
+#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__)
+#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__)
+#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__)
+#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__)
+#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__)
+#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
-static inline size_t printbuf_remaining(struct printbuf *buf)
+void bch2_pr_time_units(struct printbuf *, u64);
+
+#ifdef __KERNEL__
+static inline void pr_time(struct printbuf *out, u64 time)
{
- return buf->end - buf->pos;
+ prt_printf(out, "%llu", time);
}
+#else
+#include <time.h>
+static inline void pr_time(struct printbuf *out, u64 _time)
+{
+ char time_str[64];
+ time_t time = _time;
+ struct tm *tm = localtime(&time);
+ size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
+ if (!err)
+ prt_printf(out, "(formatting error)");
+ else
+ prt_printf(out, "%s", time_str);
+}
+#endif
-#define _PBUF(_buf, _len) \
- ((struct printbuf) { \
- .pos = _buf, \
- .end = _buf + _len, \
- })
-
-#define PBUF(_buf) _PBUF(_buf, sizeof(_buf))
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+ sprintf(out, "%pUb", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
-#define pr_buf(_out, ...) \
-do { \
- (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \
- __VA_ARGS__); \
-} while (0)
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
+{
+ char uuid_str[40];
-void bch_scnmemcpy(struct printbuf *, const char *, size_t);
+ uuid_unparse_lower(uuid, uuid_str);
+ prt_printf(out, "%s", uuid_str);
+}
int bch2_strtoint_h(const char *, int *);
int bch2_strtouint_h(const char *, unsigned int *);
@@ -326,8 +348,8 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
_r; \
})
-#define snprint(buf, size, var) \
- snprintf(buf, size, \
+#define snprint(out, var) \
+ prt_printf(out, \
type_is(var, int) ? "%i\n" \
: type_is(var, unsigned) ? "%u\n" \
: type_is(var, long) ? "%li\n" \
@@ -337,60 +359,71 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
: type_is(var, char *) ? "%s\n" \
: "%i\n", var)
-void bch2_hprint(struct printbuf *, s64);
-
bool bch2_is_zero(const void *, size_t);
-void bch2_string_opt_to_text(struct printbuf *,
- const char * const [], size_t);
-
-void bch2_flags_to_text(struct printbuf *, const char * const[], u64);
u64 bch2_read_flag_list(char *, const char * const[]);
+void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines);
+
+typedef DARRAY(unsigned long) bch_stacktrace;
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *);
+void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *);
+
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-struct quantiles {
- struct quantile_entry {
+struct bch2_quantiles {
+ struct bch2_quantile_entry {
u64 m;
u64 step;
} entries[NR_QUANTILES];
};
-struct time_stat_buffer {
+struct bch2_time_stat_buffer {
unsigned nr;
- struct time_stat_buffer_entry {
+ struct bch2_time_stat_buffer_entry {
u64 start;
u64 end;
} entries[32];
};
-struct time_stats {
+struct bch2_time_stats {
spinlock_t lock;
- u64 count;
/* all fields are in nanoseconds */
- u64 average_duration;
- u64 average_frequency;
u64 max_duration;
+ u64 min_duration;
+ u64 max_freq;
+ u64 min_freq;
u64 last_event;
- struct quantiles quantiles;
+ struct bch2_quantiles quantiles;
- struct time_stat_buffer __percpu *buffer;
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance freq_stats;
+ struct mean_and_variance_weighted freq_stats_weighted;
+ struct bch2_time_stat_buffer __percpu *buffer;
};
-void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+#endif
-static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
-void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-void bch2_time_stats_exit(struct time_stats *);
-void bch2_time_stats_init(struct time_stats *);
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
#define ewma_add(ewma, val, weight) \
({ \
@@ -444,7 +477,7 @@ struct bch_pd_controller {
void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
void bch2_pd_controller_init(struct bch_pd_controller *);
-size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
#define sysfs_pd_controller_attribute(name) \
rw_attribute(name##_rate); \
@@ -468,7 +501,7 @@ do { \
sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
\
if (attr == &sysfs_##name##_rate_debug) \
- return bch2_pd_controller_print_debug(var, buf); \
+ bch2_pd_controller_debug_to_text(out, var); \
} while (0)
#define sysfs_pd_controller_store(name, var) \
@@ -501,7 +534,9 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
+int _bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
+#define bch2_bio_alloc_pages(_bio, _size, _gfp) \
+ alloc_hooks(_bch2_bio_alloc_pages(_bio, _size, _gfp), int, -ENOMEM)
static inline sector_t bdev_sectors(struct block_device *bdev)
{
@@ -514,6 +549,26 @@ do { \
submit_bio(bio); \
} while (0)
+#define kthread_wait(cond) \
+({ \
+ int _ret = 0; \
+ \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
#define kthread_wait_freezable(cond) \
({ \
int _ret = 0; \
@@ -590,6 +645,20 @@ static inline void memmove_u64s_down(void *dst, const void *src,
__memmove_u64s_down(dst, src, u64s);
}
+static inline void __memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ memcpy_u64s_small(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down_small(dst, src, u64s);
+}
+
static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
unsigned u64s)
{
@@ -653,35 +722,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}
-static inline struct bio_vec next_contig_bvec(struct bio *bio,
- struct bvec_iter *iter)
-{
- struct bio_vec bv = bio_iter_iovec(bio, *iter);
-
- bio_advance_iter(bio, iter, bv.bv_len);
-#ifndef CONFIG_HIGHMEM
- while (iter->bi_size) {
- struct bio_vec next = bio_iter_iovec(bio, *iter);
-
- if (page_address(bv.bv_page) + bv.bv_offset + bv.bv_len !=
- page_address(next.bv_page) + next.bv_offset)
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter(bio, iter, next.bv_len);
- }
-#endif
- return bv;
-}
-
-#define __bio_for_each_contig_segment(bv, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bv = next_contig_bvec((bio), &(iter))), 1);)
-
-#define bio_for_each_contig_segment(bv, bio, iter) \
- __bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
-
void sort_cmp_size(void *base, size_t num, size_t size,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
@@ -710,6 +750,31 @@ do { \
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
+static inline void __move_gap(void *array, size_t element_size,
+ size_t nr, size_t size,
+ size_t old_gap, size_t new_gap)
+{
+ size_t gap_end = old_gap + size - nr;
+
+ if (new_gap < old_gap) {
+ size_t move = old_gap - new_gap;
+
+ memmove(array + element_size * (gap_end - move),
+ array + element_size * (old_gap - move),
+ element_size * move);
+ } else if (new_gap > old_gap) {
+ size_t move = new_gap - old_gap;
+
+ memmove(array + element_size * old_gap,
+ array + element_size * gap_end,
+ element_size * move);
+ }
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
+ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+
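As a concrete illustration of the gap-buffer helper above (a hypothetical sketch, not part of the patch: the array name, element type and sizes are made up), moving the gap ahead of an insertion might look like:

	/* 8-slot gap buffer of u64s currently holding 5 keys, gap at the end: */
	u64 keys[8];
	size_t nr = 5, size = 8, gap = 5;

	/* to insert at index 2, move the gap there first: */
	move_gap(keys, nr, size, gap, 2);
	gap = 2;
	/* keys[2..4] are now free; the former keys[2..4] live in keys[5..7] */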
#define bubble_sort(_base, _nr, _cmp) \
do { \
ssize_t _i, _end; \
@@ -778,13 +843,4 @@ static inline int u8_cmp(u8 l, u8 r)
return cmp_int(l, r);
}
-#ifdef __KERNEL__
-static inline void uuid_unparse_lower(u8 *uuid, char *out)
-{
- sprintf(out, "%plU", uuid);
-}
-#else
-#include <uuid/uuid.h>
-#endif
-
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
index a2d6bb7136c7..5143b603bf67 100644
--- a/fs/bcachefs/varint.c
+++ b/fs/bcachefs/varint.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
+#include <linux/math.h>
#include <linux/string.h>
#include <asm/unaligned.h>
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
index c099cdc0605f..53a694d71967 100644
--- a/fs/bcachefs/vstructs.h
+++ b/fs/bcachefs/vstructs.h
@@ -20,7 +20,7 @@
({ \
BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
\
- (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
+ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
})
#define vstruct_bytes(_s) \
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 7dec2f5e573e..9f77bb2ecf5f 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -69,32 +69,51 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
.cmp_bkey = xattr_cmp_bkey,
};
-const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+ unsigned flags, struct printbuf *err)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
- if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
- return "value too small";
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) {
+ prt_printf(err, "incorrect value size (%zu < %zu)",
+ bkey_val_bytes(k.k), sizeof(*xattr.v));
+ return -BCH_ERR_invalid_bkey;
+ }
if (bkey_val_u64s(k.k) <
xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len)))
- return "value too small";
+ le16_to_cpu(xattr.v->x_val_len))) {
+ prt_printf(err, "value too small (%zu < %u)",
+ bkey_val_u64s(k.k),
+ xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len)));
+ return -BCH_ERR_invalid_bkey;
+ }
+ /* XXX why +4 ? */
if (bkey_val_u64s(k.k) >
xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4))
- return "value too big";
+ le16_to_cpu(xattr.v->x_val_len) + 4)) {
+ prt_printf(err, "value too big (%zu > %u)",
+ bkey_val_u64s(k.k),
+ xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4));
+ return -BCH_ERR_invalid_bkey;
+ }
handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (!handler)
- return "invalid type";
+ if (!handler) {
+ prt_printf(err, "invalid type (%u)", xattr.v->x_type);
+ return -BCH_ERR_invalid_bkey;
+ }
- if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
- return "xattr name has invalid characters";
+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) {
+ prt_printf(err, "xattr name has invalid characters");
+ return -BCH_ERR_invalid_bkey;
+ }
- return NULL;
+ return 0;
}
void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -105,17 +124,17 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
handler = bch2_xattr_type_to_handler(xattr.v->x_type);
if (handler && handler->prefix)
- pr_buf(out, "%s", handler->prefix);
+ prt_printf(out, "%s", handler->prefix);
else if (handler)
- pr_buf(out, "(type %u)", xattr.v->x_type);
+ prt_printf(out, "(type %u)", xattr.v->x_type);
else
- pr_buf(out, "(unknown type %u)", xattr.v->x_type);
+ prt_printf(out, "(unknown type %u)", xattr.v->x_type);
- bch_scnmemcpy(out, xattr.v->x_name,
- xattr.v->x_name_len);
- pr_buf(out, ":");
- bch_scnmemcpy(out, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ prt_printf(out, "%.*s:%.*s",
+ xattr.v->x_name_len,
+ xattr.v->x_name,
+ le16_to_cpu(xattr.v->x_val_len),
+ (char *) xattr_val(xattr.v));
}
static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
@@ -311,13 +330,9 @@ retry:
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
- SPOS(inum, offset, snapshot), 0, k, ret) {
- BUG_ON(k.k->p.inode < inum);
-
- if (k.k->p.inode > inum)
- break;
-
+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+ SPOS(inum, offset, snapshot),
+ POS(inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_xattr)
continue;
@@ -329,23 +344,25 @@ retry:
offset = iter.pos.offset;
bch2_trans_iter_exit(&trans, &iter);
err:
- if (ret == -EINTR)
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_exit(&trans);
if (ret)
- return ret;
+ goto out;
ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
if (ret)
- return ret;
+ goto out;
ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
if (ret)
- return ret;
+ goto out;
return buf.used;
+out:
+ return bch2_err_class(ret);
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,
@@ -354,11 +371,14 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret;
- return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+ ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+ return bch2_err_class(ret);
}
static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
@@ -366,11 +386,13 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ int ret;
- return bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_xattr_set(&trans, inode_inum(inode), &hash,
name, value, size,
handler->flags, flags));
+ return bch2_err_class(ret);
}
static const struct xattr_handler bch_xattr_user_handler = {
@@ -422,12 +444,11 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_opts opts =
- bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+ bch2_inode_opts_to_opts(&inode->ei_inode);
const struct bch_option *opt;
int id, inode_opt_id;
- char buf[512];
- struct printbuf out = PBUF(buf);
- unsigned val_len;
+ struct printbuf out = PRINTBUF;
+ int ret;
u64 v;
id = bch2_opt_lookup(name);
@@ -448,16 +469,21 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
return -ENODATA;
v = bch2_opt_get_by_id(&opts, id);
- bch2_opt_to_text(&out, c, opt, v, 0);
+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
- val_len = out.pos - buf;
+ ret = out.pos;
- if (buffer && val_len > size)
- return -ERANGE;
+ if (out.allocation_failure) {
+ ret = -ENOMEM;
+ } else if (buffer) {
+ if (out.pos > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, out.buf, out.pos);
+ }
- if (buffer)
- memcpy(buffer, buf, val_len);
- return val_len;
+ printbuf_exit(&out);
+ return ret;
}
static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
@@ -491,6 +517,7 @@ static int inode_opt_set_fn(struct bch_inode_info *inode,
}
static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *dentry, struct inode *vinode,
const char *name, const void *value,
size_t size, int flags)
@@ -523,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
memcpy(buf, value, size);
buf[size] = '\0';
- ret = bch2_opt_parse(c, NULL, opt, buf, &v);
+ ret = bch2_opt_parse(c, opt, buf, &v, NULL);
kfree(buf);
if (ret < 0)
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index f4f896545e1c..1a4cff3a9d96 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -6,13 +6,13 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-#define bch2_bkey_ops_xattr (struct bkey_ops) { \
+#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
.key_invalid = bch2_xattr_invalid, \
.val_to_text = bch2_xattr_to_text, \
-}
+})
static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 3a91130a4fbd..c74b7376990d 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -2,7 +2,7 @@
#define _LINUX_GENERIC_RADIX_TREE_H
/**
- * DOC: Generic radix trees/sparse arrays:
+ * DOC: Generic radix trees/sparse arrays
*
* Very simple and minimalistic, supporting arbitrary size entries up to
* PAGE_SIZE.
@@ -38,13 +38,15 @@
#include <asm/page.h>
#include <linux/bug.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
#include <linux/log2.h>
+#include <linux/math.h>
+#include <linux/types.h>
struct genradix_root;
struct __genradix {
- struct genradix_root __rcu *root;
+ struct genradix_root *root;
};
/*
@@ -115,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_cast(_radix) (typeof((_radix)->type[0]) *)
#define __genradix_obj_size(_radix) sizeof((_radix)->type[0])
+#define __genradix_objs_per_page(_radix) \
+ (PAGE_SIZE / sizeof((_radix)->type[0]))
+#define __genradix_page_remainder(_radix) \
+ (PAGE_SIZE % sizeof((_radix)->type[0]))
+
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
@@ -178,11 +185,35 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
#define genradix_iter_peek(_iter, _radix) \
(__genradix_cast(_radix) \
__genradix_iter_peek(_iter, &(_radix)->tree, \
- PAGE_SIZE / __genradix_obj_size(_radix)))
+ __genradix_objs_per_page(_radix)))
+
+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
+ size_t, size_t);
+
+/**
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ * position
+ * @_iter: a genradix_iter
+ * @_radix: genradix being iterated over
+ *
+ * If no more entries exist at or below @_iter's current position, returns NULL
+ */
+#define genradix_iter_peek_prev(_iter, _radix) \
+ (__genradix_cast(_radix) \
+ __genradix_iter_peek_prev(_iter, &(_radix)->tree, \
+ __genradix_objs_per_page(_radix), \
+ __genradix_obj_size(_radix) + \
+ __genradix_page_remainder(_radix)))
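A minimal usage sketch for the new reverse peek (hypothetical: nums is assumed to be a populated GENRADIX(u64) and idx an arbitrary index, neither comes from the patch):

	struct genradix_iter iter = genradix_iter_init(&nums, idx);
	u64 *p = genradix_iter_peek_prev(&iter, &nums);

	if (p)
		pr_info("last entry at or before %zu is at %zu: %llu\n",
			idx, iter.pos, *p);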
static inline void __genradix_iter_advance(struct genradix_iter *iter,
size_t obj_size)
{
+ if (iter->offset + obj_size < iter->offset) {
+ iter->offset = SIZE_MAX;
+ iter->pos = SIZE_MAX;
+ return;
+ }
+
iter->offset += obj_size;
if (!is_power_of_2(obj_size) &&
@@ -195,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
#define genradix_iter_advance(_iter, _radix) \
__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+static inline void __genradix_iter_rewind(struct genradix_iter *iter,
+ size_t obj_size)
+{
+ if (iter->offset == 0 ||
+ iter->offset == SIZE_MAX) {
+ iter->offset = SIZE_MAX;
+ return;
+ }
+
+ if ((iter->offset & (PAGE_SIZE - 1)) == 0)
+ iter->offset -= PAGE_SIZE % obj_size;
+
+ iter->offset -= obj_size;
+ iter->pos--;
+}
+
+#define genradix_iter_rewind(_iter, _radix) \
+ __genradix_iter_rewind(_iter, __genradix_obj_size(_radix))
+
#define genradix_for_each_from(_radix, _iter, _p, _start) \
for (_iter = genradix_iter_init(_radix, _start); \
(_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
@@ -212,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
#define genradix_for_each(_radix, _iter, _p) \
genradix_for_each_from(_radix, _iter, _p, 0)
+#define genradix_last_pos(_radix) \
+ (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
+
+/**
+ * genradix_for_each_reverse - iterate over entries in a genradix, in reverse order
+ * @_radix: genradix to iterate over
+ * @_iter: a genradix_iter to track current position
+ * @_p: pointer to genradix entry type
+ *
+ * On every iteration, @_p will point to the current entry, and @_iter.pos
+ * will be the current entry's index.
+ */
+#define genradix_for_each_reverse(_radix, _iter, _p) \
+ for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\
+ (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\
+ genradix_iter_rewind(&_iter, _radix))
+
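Correspondingly, a reverse walk with the new iterator macro (again an illustrative sketch under the same assumptions, not code from the patch):

	GENRADIX(u64) nums;
	struct genradix_iter iter;
	u64 *p;

	genradix_init(&nums);
	/* ... entries allocated with genradix_ptr_alloc() ... */

	/* visit every allocated entry from the highest index down to 0 */
	genradix_for_each_reverse(&nums, iter, p)
		pr_info("idx %zu val %llu\n", iter.pos, *p);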
int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
/**
diff --git a/include/linux/six.h b/include/linux/six.h
index 477c33eb00d7..16ad2073f71c 100644
--- a/include/linux/six.h
+++ b/include/linux/six.h
@@ -80,9 +80,10 @@ union six_lock_state {
};
struct {
- unsigned read_lock:27;
+ unsigned read_lock:26;
unsigned write_locking:1;
unsigned intent_lock:1;
+ unsigned nospin:1;
unsigned waiters:3;
/*
* seq works much like in seqlocks: it's incremented every time
@@ -107,16 +108,23 @@ struct six_lock {
union six_lock_state state;
unsigned intent_lock_recurse;
struct task_struct *owner;
- struct optimistic_spin_queue osq;
unsigned __percpu *readers;
-
+ struct optimistic_spin_queue osq;
raw_spinlock_t wait_lock;
- struct list_head wait_list[2];
+ struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum six_lock_type lock_want;
+ bool lock_acquired;
+ u64 start_time;
+};
+
typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
static __always_inline void __six_lock_init(struct six_lock *lock,
@@ -125,8 +133,7 @@ static __always_inline void __six_lock_init(struct six_lock *lock,
{
atomic64_set(&lock->state.counter, 0);
raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+ INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *) lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
@@ -143,10 +150,37 @@ do { \
#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
#define __SIX_LOCK(type) \
-bool six_trylock_##type(struct six_lock *); \
-bool six_relock_##type(struct six_lock *, u32); \
-int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
-void six_unlock_##type(struct six_lock *);
+bool six_trylock_ip_##type(struct six_lock *, unsigned long); \
+bool six_relock_ip_##type(struct six_lock *, u32, unsigned long); \
+int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn, \
+ void *, unsigned long); \
+int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\
+ six_lock_should_sleep_fn, void *, unsigned long);\
+void six_unlock_ip_##type(struct six_lock *, unsigned long); \
+ \
+static inline bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return six_trylock_ip_##type(lock, _THIS_IP_); \
+} \
+static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
+{ \
+ return six_relock_ip_##type(lock, seq, _THIS_IP_); \
+} \
+static inline int six_lock_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn fn, void *p)\
+{ \
+ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
+} \
+static inline int six_lock_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn fn, void *p) \
+{ \
+ return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \
+} \
+static inline void six_unlock_##type(struct six_lock *lock) \
+{ \
+ return six_unlock_ip_##type(lock, _THIS_IP_); \
+}
__SIX_LOCK(read)
__SIX_LOCK(intent)
@@ -182,6 +216,21 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
}
+static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip);
+}
+
+static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p);
+}
+
static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
SIX_LOCK_DISPATCH(type, six_unlock, lock);
@@ -196,8 +245,13 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
-void six_lock_pcpu_free_rcu(struct six_lock *);
void six_lock_pcpu_free(struct six_lock *);
void six_lock_pcpu_alloc(struct six_lock *);
+struct six_lock_count {
+ unsigned n[3];
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+
#endif /* _LINUX_SIX_H */
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 1e82ff890a0c..8f0f16061285 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -7,21 +7,29 @@
#include <linux/tracepoint.h>
+#define TRACE_BPOS_entries(name) \
+ __field(u64, name##_inode ) \
+ __field(u64, name##_offset ) \
+ __field(u32, name##_snapshot )
+
+#define TRACE_BPOS_assign(dst, src) \
+ __entry->dst##_inode = (src).inode; \
+ __entry->dst##_offset = (src).offset; \
+ __entry->dst##_snapshot = (src).snapshot
+
DECLARE_EVENT_CLASS(bpos,
- TP_PROTO(struct bpos *p),
+ TP_PROTO(const struct bpos *p),
TP_ARGS(p),
TP_STRUCT__entry(
- __field(u64, inode )
- __field(u64, offset )
+ TRACE_BPOS_entries(p)
),
TP_fast_assign(
- __entry->inode = p->inode;
- __entry->offset = p->offset;
+ TRACE_BPOS_assign(p, *p);
),
- TP_printk("%llu:%llu", __entry->inode, __entry->offset)
+ TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
);
DECLARE_EVENT_CLASS(bkey,
@@ -44,6 +52,31 @@ DECLARE_EVENT_CLASS(bkey,
__entry->offset, __entry->size)
);
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->level,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
@@ -71,10 +104,10 @@ DECLARE_EVENT_CLASS(bio,
),
TP_fast_assign(
- __entry->dev = bio->bi_disk ? bio_dev(bio) : 0;
+ __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
- blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
TP_printk("%d,%d %s %llu + %u",
@@ -82,9 +115,29 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* super-io.c: */
+TRACE_EVENT(write_super,
+ TP_PROTO(struct bch_fs *c, unsigned long ip),
+ TP_ARGS(c, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned long, ip )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->ip = ip;
+ ),
+
+ TP_printk("%d,%d for %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (void *) __entry->ip)
+);
+
/* io.c: */
-DEFINE_EVENT(bio, read_split,
+DEFINE_EVENT(bio, read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
@@ -94,12 +147,17 @@ DEFINE_EVENT(bio, read_bounce,
TP_ARGS(bio)
);
+DEFINE_EVENT(bio, read_split,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
DEFINE_EVENT(bio, read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, promote,
+DEFINE_EVENT(bio, read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
@@ -122,17 +180,21 @@ DEFINE_EVENT(bio, journal_write,
);
TRACE_EVENT(journal_reclaim_start,
- TP_PROTO(struct bch_fs *c, u64 min_nr,
+ TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+ u64 min_nr, u64 min_key_cache,
u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, min_nr, prereserved, prereserved_total,
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
TP_STRUCT__entry(
__field(dev_t, dev )
+ __field(bool, direct )
+ __field(bool, kicked )
__field(u64, min_nr )
+ __field(u64, min_key_cache )
__field(u64, prereserved )
__field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
@@ -143,7 +205,10 @@ TRACE_EVENT(journal_reclaim_start,
TP_fast_assign(
__entry->dev = c->dev;
+ __entry->direct = direct;
+ __entry->kicked = kicked;
__entry->min_nr = min_nr;
+ __entry->min_key_cache = min_key_cache;
__entry->prereserved = prereserved;
__entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
@@ -152,9 +217,12 @@ TRACE_EVENT(journal_reclaim_start,
__entry->btree_key_cache_total = btree_key_cache_total;
),
- TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->direct,
+ __entry->kicked,
__entry->min_nr,
+ __entry->min_key_cache,
__entry->prereserved,
__entry->prereserved_total,
__entry->btree_cache_dirty,
@@ -177,7 +245,7 @@ TRACE_EVENT(journal_reclaim_finish,
__entry->nr_flushed = nr_flushed;
),
- TP_printk("%d%d flushed %llu",
+ TP_printk("%d,%d flushed %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->nr_flushed)
);
@@ -185,44 +253,65 @@ TRACE_EVENT(journal_reclaim_finish,
/* bset.c: */
DEFINE_EVENT(bpos, bkey_pack_pos_fail,
- TP_PROTO(struct bpos *p),
+ TP_PROTO(const struct bpos *p),
TP_ARGS(p)
);
-/* Btree */
+/* Btree cache: */
-DECLARE_EVENT_CLASS(btree_node,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b),
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(long nr_to_scan, long can_free, long ret),
+ TP_ARGS(nr_to_scan, can_free, ret),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u8, level )
- __field(u8, id )
- __field(u64, inode )
- __field(u64, offset )
+ __field(long, nr_to_scan )
+ __field(long, can_free )
+ __field(long, ret )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->level = b->c.level;
- __entry->id = b->c.btree_id;
- __entry->inode = b->key.k.p.inode;
- __entry->offset = b->key.k.p.offset;
+ __entry->nr_to_scan = nr_to_scan;
+ __entry->can_free = can_free;
+ __entry->ret = ret;
),
- TP_printk("%d,%d %u id %u %llu:%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->level, __entry->id,
- __entry->inode, __entry->offset)
+ TP_printk("scanned for %li nodes, can free %li, ret %li",
+ __entry->nr_to_scan, __entry->can_free, __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_read,
+DEFINE_EVENT(btree_node, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-TRACE_EVENT(btree_write,
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_node_write,
TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
TP_ARGS(b, bytes, sectors),
@@ -252,268 +341,365 @@ DEFINE_EVENT(btree_node, btree_node_free,
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_node_reap,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
TRACE_EVENT(btree_reserve_get_fail,
- TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl),
- TP_ARGS(c, required, cl),
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ size_t required,
+ int ret),
+ TP_ARGS(trans_fn, caller_ip, required, ret),
TP_STRUCT__entry(
- __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
__field(size_t, required )
- __field(struct closure *, cl )
+ __array(char, ret, 32 )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->required = required;
- __entry->cl = cl;
+ strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->required = required;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
),
- TP_printk("%d,%d required %zu by %p",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->required, __entry->cl)
+ TP_printk("%s %pS required %zu ret %s",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->required,
+ __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_split,
+DEFINE_EVENT(btree_node, btree_node_compact,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_compact,
+DEFINE_EVENT(btree_node, btree_node_merge,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_merge,
+DEFINE_EVENT(btree_node, btree_node_split,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_set_root,
+DEFINE_EVENT(btree_node, btree_node_rewrite,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-TRACE_EVENT(btree_cache_scan,
- TP_PROTO(unsigned long nr_to_scan_pages,
- unsigned long nr_to_scan_nodes,
- unsigned long can_free_nodes,
- long ret),
- TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret),
+DEFINE_EVENT(btree_node, btree_node_set_root,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_path_relock_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
- __field(unsigned long, nr_to_scan_pages )
- __field(unsigned long, nr_to_scan_nodes )
- __field(unsigned long, can_free_nodes )
- __field(long, ret )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __array(char, node, 24 )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
),
TP_fast_assign(
- __entry->nr_to_scan_pages = nr_to_scan_pages;
- __entry->nr_to_scan_nodes = nr_to_scan_nodes;
- __entry->can_free_nodes = can_free_nodes;
- __entry->ret = ret;
+ struct btree *b = btree_path_node(path, level);
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+ if (IS_ERR(b))
+ strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+ else
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
),
- TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li",
- __entry->nr_to_scan_pages,
- __entry->nr_to_scan_nodes,
- __entry->can_free_nodes,
- __entry->ret)
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->node,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
);
-TRACE_EVENT(btree_node_relock_fail,
- TP_PROTO(const char *trans_fn,
+TRACE_EVENT(btree_path_upgrade_fail,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned long node,
- u32 iter_lock_seq,
- u32 node_lock_seq),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
- __array(char, caller, 32 )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
__field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
- __field(unsigned long, node )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locked )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
__field(u32, iter_lock_seq )
__field(u32, node_lock_seq )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
- __entry->node = node;
- __entry->iter_lock_seq = iter_lock_seq;
- __entry->node_lock_seq = node_lock_seq;
- ),
-
- TP_printk("%s %s btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+ struct six_lock_count c;
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = level;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->locked = btree_node_locked(path, level);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
- __entry->caller,
- __entry->btree_id,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
- __entry->node,
+ __entry->level,
+ __entry->locked,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
__entry->iter_lock_seq,
__entry->node_lock_seq)
);
/* Garbage collection */
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, gc_start,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_end,
+DEFINE_EVENT(bch_fs, gc_gens_start,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
+DEFINE_EVENT(bch_fs, gc_gens_end,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c)
);
/* Allocator */
-TRACE_EVENT(alloc_scan,
- TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
- TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
+DECLARE_EVENT_CLASS(bucket_alloc,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, found )
- __field(u64, inc_gen )
- __field(u64, inc_gen_skipped )
+ __field(u8, dev )
+ __array(char, reserve, 16 )
+ __field(u64, bucket )
+ __field(u64, free )
+ __field(u64, avail )
+ __field(u64, copygc_wait_amount )
+ __field(s64, copygc_waiting_for )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, nouse )
+ __field(bool, nonblocking )
+ __field(u64, nocow )
+ __array(char, err, 32 )
),
TP_fast_assign(
- __entry->dev = ca->dev;
- __entry->found = found;
- __entry->inc_gen = inc_gen;
- __entry->inc_gen_skipped = inc_gen_skipped;
+ __entry->dev = ca->dev_idx;
+ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+ __entry->bucket = bucket;
+ __entry->free = free;
+ __entry->avail = avail;
+ __entry->copygc_wait_amount = copygc_wait_amount;
+ __entry->copygc_waiting_for = copygc_waiting_for;
+ __entry->seen = s->buckets_seen;
+ __entry->open = s->skipped_open;
+ __entry->need_journal_commit = s->skipped_need_journal_commit;
+ __entry->nouse = s->skipped_nouse;
+ __entry->nonblocking = nonblocking;
+ __entry->nocow = s->skipped_nocow;
+ strscpy(__entry->err, err, sizeof(__entry->err));
),
- TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
+ TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
+ __entry->reserve,
+ __entry->dev,
+ __entry->bucket,
+ __entry->free,
+ __entry->avail,
+ __entry->copygc_wait_amount,
+ __entry->copygc_waiting_for,
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->nouse,
+ __entry->nocow,
+ __entry->nonblocking,
+ __entry->err)
);
-TRACE_EVENT(invalidate,
- TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
- TP_ARGS(ca, offset, sectors),
+DEFINE_EVENT(bucket_alloc, bucket_alloc,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err)
+);
+
+TRACE_EVENT(discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
TP_STRUCT__entry(
- __field(unsigned, sectors )
__field(dev_t, dev )
- __field(__u64, offset )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, discarded )
+ __array(char, err, 16 )
),
TP_fast_assign(
- __entry->dev = ca->dev;
- __entry->offset = offset,
- __entry->sectors = sectors;
+ __entry->dev = c->dev;
+ __entry->seen = seen;
+ __entry->open = open;
+ __entry->need_journal_commit = need_journal_commit;
+ __entry->discarded = discarded;
+ strscpy(__entry->err, err, sizeof(__entry->err));
),
- TP_printk("invalidated %u sectors at %d,%d sector=%llu",
- __entry->sectors,
- MAJOR(__entry->dev),
- MINOR(__entry->dev),
- __entry->offset)
+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->discarded,
+ __entry->err)
);
-DECLARE_EVENT_CLASS(bucket_alloc,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve),
+TRACE_EVENT(bucket_invalidate,
+ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+ TP_ARGS(c, dev, bucket, sectors),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(enum alloc_reserve, reserve )
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u32, sectors )
+ __field(u64, bucket )
),
TP_fast_assign(
- __entry->dev = ca->dev;
- __entry->reserve = reserve;
+ __entry->dev = c->dev;
+ __entry->dev_idx = dev;
+ __entry->sectors = sectors;
+ __entry->bucket = bucket;
),
- TP_printk("%d,%d reserve %d",
+ TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->reserve)
-);
-
-DEFINE_EVENT(bucket_alloc, bucket_alloc,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
+ __entry->dev_idx, __entry->bucket,
+ __entry->sectors)
);
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
-);
+/* Moving IO */
-DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
- TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
- TP_ARGS(ca, reserve)
+DEFINE_EVENT(bkey, move_extent_read,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
);
-/* Moving IO */
-
-DEFINE_EVENT(bkey, move_extent,
+DEFINE_EVENT(bkey, move_extent_write,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(bkey, move_alloc_fail,
+DEFINE_EVENT(bkey, move_extent_finish,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(bkey, move_race,
+TRACE_EVENT(move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *msg),
+ TP_ARGS(c, msg),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __string(msg, msg )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __assign_str(msg, msg);
+ ),
+
+ TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+);
+
+DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
@@ -540,6 +726,39 @@ TRACE_EVENT(move_data,
__entry->sectors_moved, __entry->keys_moved)
);
+TRACE_EVENT(evacuate_bucket,
+ TP_PROTO(struct bch_fs *c, struct bpos *bucket,
+ unsigned sectors, unsigned bucket_size,
+ u64 fragmentation, int ret),
+ TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, member )
+ __field(u64, bucket )
+ __field(u32, sectors )
+ __field(u32, bucket_size )
+ __field(u64, fragmentation )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->member = bucket->inode;
+ __entry->bucket = bucket->offset;
+ __entry->sectors = sectors;
+ __entry->bucket_size = bucket_size;
+ __entry->fragmentation = fragmentation;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->member, __entry->bucket,
+ __entry->sectors, __entry->bucket_size,
+ __entry->fragmentation, __entry->ret)
+);
+
TRACE_EVENT(copygc,
TP_PROTO(struct bch_fs *c,
u64 sectors_moved, u64 sectors_not_moved,
@@ -592,308 +811,331 @@ TRACE_EVENT(copygc_wait,
__entry->wait_amount, __entry->until)
);
-DECLARE_EVENT_CLASS(transaction_restart,
- TP_PROTO(const char *trans_fn,
+/* btree transactions: */
+
+DECLARE_EVENT_CLASS(transaction_event,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip),
+ TP_ARGS(trans, caller_ip),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
),
TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
);
-DEFINE_EVENT(transaction_restart, transaction_restart_ip,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, transaction_commit,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_injected,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
- TP_PROTO(const char *trans_fn,
+TRACE_EVENT(trans_restart_split_race,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, level )
+ __field(u16, written )
+ __field(u16, blocks )
+ __field(u16, u64s_remaining )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->level = b->c.level;
+ __entry->written = b->written;
+ __entry->blocks = btree_blocks(trans->c);
+ __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ ),
+
+ TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
+ __entry->trans_fn, (void *) __entry->caller_ip,
+ __entry->level,
+ __entry->written, __entry->blocks,
+ __entry->u64s_remaining)
+);
+
+DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_journal_res_get,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim,
- TP_PROTO(const char *trans_fn,
+
+TRACE_EVENT(trans_restart_journal_preres_get,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned flags),
+ TP_ARGS(trans, caller_ip, flags),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, flags )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%s %pS %x", __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->flags)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_fault_inject,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_traverse_all,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_traverse_all,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
- TP_PROTO(const char *trans_fn,
+DEFINE_EVENT(transaction_event, trans_restart_mark_replicas,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_too_many_iters,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans_fn, caller_ip)
+ TP_ARGS(trans, caller_ip)
);
DECLARE_EVENT_CLASS(transaction_restart_iter,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos),
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
- __array(char, caller, 32 )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
__field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
+ TRACE_BPOS_entries(pos)
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- snprintf(__entry->caller, sizeof(__entry->caller), "%pS", (void *) caller_ip);
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos)
),
- TP_printk("%s %s btree %u pos %llu:%llu:%u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u",
__entry->trans_fn,
- __entry->caller,
- __entry->btree_id,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_mark,
- TP_PROTO(const char *trans_fn,
+TRACE_EVENT(trans_restart_upgrade,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
-);
+ struct btree_path *path,
+ unsigned old_locks_want,
+ unsigned new_locks_want),
+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
-DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
-);
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, old_locks_want )
+ __field(u8, new_locks_want )
+ TRACE_BPOS_entries(pos)
+ ),
-DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = new_locks_want;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_locks_want,
+ __entry->new_locks_want)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *pos),
- TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
);
-TRACE_EVENT(trans_restart_would_deadlock,
- TP_PROTO(const char *trans_fn,
- unsigned long caller_ip,
- bool in_traverse_all,
- unsigned reason,
- enum btree_id have_btree_id,
- unsigned have_iter_type,
- struct bpos *have_pos,
- enum btree_id want_btree_id,
- unsigned want_iter_type,
- struct bpos *want_pos),
- TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason,
- have_btree_id, have_iter_type, have_pos,
- want_btree_id, want_iter_type, want_pos),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
- __field(unsigned long, caller_ip )
- __field(u8, in_traverse_all )
- __field(u8, reason )
- __field(u8, have_btree_id )
- __field(u8, have_iter_type )
- __field(u8, want_btree_id )
- __field(u8, want_iter_type )
-
- __field(u64, have_pos_inode )
- __field(u64, have_pos_offset )
- __field(u32, have_pos_snapshot)
- __field(u32, want_pos_snapshot)
- __field(u64, want_pos_inode )
- __field(u64, want_pos_offset )
- ),
-
- TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->in_traverse_all = in_traverse_all;
- __entry->reason = reason;
- __entry->have_btree_id = have_btree_id;
- __entry->have_iter_type = have_iter_type;
- __entry->want_btree_id = want_btree_id;
- __entry->want_iter_type = want_iter_type;
-
- __entry->have_pos_inode = have_pos->inode;
- __entry->have_pos_offset = have_pos->offset;
- __entry->have_pos_snapshot = have_pos->snapshot;
+DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
- __entry->want_pos_inode = want_pos->inode;
- __entry->want_pos_offset = want_pos->offset;
- __entry->want_pos_snapshot = want_pos->snapshot;
- ),
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
- TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->in_traverse_all,
- __entry->reason,
- __entry->have_btree_id,
- __entry->have_iter_type,
- __entry->have_pos_inode,
- __entry->have_pos_offset,
- __entry->have_pos_snapshot,
- __entry->want_btree_id,
- __entry->want_iter_type,
- __entry->want_pos_inode,
- __entry->want_pos_offset,
- __entry->want_pos_snapshot)
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
);
TRACE_EVENT(trans_restart_would_deadlock_write,
- TP_PROTO(const char *trans_fn),
- TP_ARGS(trans_fn),
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
),
TP_printk("%s", __entry->trans_fn)
);
TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(const char *trans_fn,
+ TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
unsigned long bytes),
- TP_ARGS(trans_fn, caller_ip, bytes),
+ TP_ARGS(trans, caller_ip, bytes),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(unsigned long, bytes )
),
TP_fast_assign(
- strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->bytes = bytes;
),
@@ -904,6 +1146,89 @@ TRACE_EVENT(trans_restart_mem_realloced,
__entry->bytes)
);
+TRACE_EVENT(trans_restart_key_cache_key_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_u64s,
+ unsigned new_u64s),
+ TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(enum btree_id, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u32, old_u64s )
+ __field(u32, new_u64s )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->old_u64s = old_u64s;
+ __entry->new_u64s = new_u64s;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_u64s,
+ __entry->new_u64s)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
+ TP_ARGS(trans, nr, skipped, fast, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, skipped )
+ __field(size_t, fast )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->skipped = skipped;
+ __entry->fast = fast;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu skipped %zu fast %zu",
+ __entry->nr, __entry->size, __entry->skipped, __entry->fast)
+);
+
+TRACE_EVENT(write_buffer_flush_slowpath,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
+ TP_ARGS(trans, nr, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu", __entry->nr, __entry->size)
+);
+
#endif /* _TRACE_BCACHE_H */
/* This part must be outside protection */
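The reworked events above rely on two helper macros, TRACE_BPOS_entries() and TRACE_BPOS_assign(), added earlier in this header and not visible in the hunks shown here. Reconstructed from the pos_inode/pos_offset/pos_snapshot fields that the TP_printk() callbacks reference, they would expand roughly as follows (a sketch, not the verbatim definitions from the patch):

#define TRACE_BPOS_entries(name)			\
	__field(u64,	name##_inode	)		\
	__field(u64,	name##_offset	)		\
	__field(u32,	name##_snapshot	)

#define TRACE_BPOS_assign(dst, src)			\
	__entry->dst##_inode	= (src).inode;		\
	__entry->dst##_offset	= (src).offset;		\
	__entry->dst##_snapshot	= (src).snapshot

This is what lets every converted event print "pos %llu:%llu:%u" from __entry->pos_inode, __entry->pos_offset and __entry->pos_snapshot while naming the macro only once in TP_STRUCT__entry() and TP_fast_assign().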
diff --git a/kernel/locking/six.c b/kernel/locking/six.c
index 75a735acd11b..3d366a843eb5 100644
--- a/kernel/locking/six.c
+++ b/kernel/locking/six.c
@@ -6,18 +6,23 @@
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include <linux/six.h>
#include <linux/slab.h>
+#include <trace/events/lock.h>
+
#ifdef DEBUG
#define EBUG_ON(cond) BUG_ON(cond)
#else
#define EBUG_ON(cond) do {} while (0)
#endif
-#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
-#define six_release(l) lock_release(l, 0, _RET_IP_)
+#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
+#define six_release(l, ip) lock_release(l, ip)
+
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */
@@ -65,14 +70,15 @@ struct six_lock_vals {
}
static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- union six_lock_state old)
+ union six_lock_state old,
+ struct task_struct *owner)
{
if (type != SIX_LOCK_intent)
return;
if (!old.intent_lock) {
EBUG_ON(lock->owner);
- lock->owner = current;
+ lock->owner = owner;
} else {
EBUG_ON(lock->owner != current);
}
@@ -88,64 +94,21 @@ static inline unsigned pcpu_read_count(struct six_lock *lock)
return read_count;
}
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
-};
-
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-static inline void six_lock_wakeup(struct six_lock *lock,
- union six_lock_state state,
- unsigned waitlist_id)
-{
- if (waitlist_id == SIX_LOCK_write) {
- if (state.write_locking && !state.read_lock) {
- struct task_struct *p = READ_ONCE(lock->owner);
- if (p)
- wake_up_process(p);
- }
- } else {
- struct list_head *wait_list = &lock->wait_list[waitlist_id];
- struct six_lock_waiter *w, *next;
-
- if (!(state.waiters & (1 << waitlist_id)))
- return;
-
- clear_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
-
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, wait_list, list) {
- list_del_init(&w->list);
-
- if (wake_up_process(w->task) &&
- waitlist_id != SIX_LOCK_read) {
- if (!list_empty(wait_list))
- set_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
- break;
- }
- }
-
- raw_spin_unlock(&lock->wait_lock);
- }
-}
-
-static __always_inline bool do_six_trylock_type(struct six_lock *lock,
- enum six_lock_type type,
- bool try)
+static int __do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type,
+ struct task_struct *task,
+ bool try)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
- bool ret;
+ int ret;
u64 v;
- EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
-
EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
/*
@@ -164,7 +127,6 @@ static __always_inline bool do_six_trylock_type(struct six_lock *lock,
*/
if (type == SIX_LOCK_read && lock->readers) {
-retry:
preempt_disable();
this_cpu_inc(*lock->readers); /* signal that we own lock */
@@ -181,38 +143,30 @@ retry:
* lock, issue a wakeup because we might have caused a
* spurious trylock failure:
*/
- if (old.write_locking) {
- struct task_struct *p = READ_ONCE(lock->owner);
-
- if (p)
- wake_up_process(p);
- }
-
+#if 0
/*
- * If we failed from the lock path and the waiting bit wasn't
- * set, set it:
+ * This code should be sufficient, but we're seeing unexplained
+ * lost wakeups:
*/
- if (!try && !ret) {
- v = old.v;
-
- do {
- new.v = old.v = v;
-
- if (!(old.v & l[type].lock_fail))
- goto retry;
-
- if (new.waiters & (1 << type))
- break;
-
- new.waiters |= 1 << type;
- } while ((v = atomic64_cmpxchg(&lock->state.counter,
- old.v, new.v)) != old.v);
- }
+ if (old.write_locking)
+ ret = -1 - SIX_LOCK_write;
+#else
+ if (!ret)
+ ret = -1 - SIX_LOCK_write;
+#endif
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic64_add(__SIX_VAL(write_locking, 1),
&lock->state.counter);
smp_mb__after_atomic();
+ } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) {
+ atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write),
+ &lock->state.counter);
+ /*
+ * pairs with barrier after unlock and before checking
+ * for readers in unlock path
+ */
+ smp_mb__after_atomic();
}
ret = !pcpu_read_count(lock);
@@ -229,7 +183,8 @@ retry:
if (try && !ret) {
old.v = atomic64_add_return(v, &lock->state.counter);
- six_lock_wakeup(lock, old, SIX_LOCK_read);
+ if (old.waiters & (1 << SIX_LOCK_read))
+ ret = -1 - SIX_LOCK_read;
} else {
atomic64_add(v, &lock->state.counter);
}
@@ -243,8 +198,7 @@ retry:
if (type == SIX_LOCK_write)
new.write_locking = 0;
- } else if (!try && type != SIX_LOCK_write &&
- !(new.waiters & (1 << type)))
+ } else if (!try && !(new.waiters & (1 << type)))
new.waiters |= 1 << type;
else
break; /* waiting bit already set */
@@ -256,28 +210,99 @@ retry:
EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
}
- if (ret)
- six_set_owner(lock, type, old);
+ if (ret > 0)
+ six_set_owner(lock, type, old, task);
- EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+ EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking));
return ret;
}
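+/*
+ * Wake up waiters for @lock_type: readers are handed the lock one after
+ * another for as long as the handoff keeps succeeding, any other lock type
+ * is handed off to at most one waiter; a negative handoff result sends us
+ * back around for the waitlist it encodes.
+ */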
+static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+ struct six_lock_waiter *w, *next;
+ struct task_struct *task;
+ bool saw_one;
+ int ret;
+again:
+ ret = 0;
+ saw_one = false;
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+ if (w->lock_want != lock_type)
+ continue;
+
+ if (saw_one && lock_type != SIX_LOCK_read)
+ goto unlock;
+ saw_one = true;
+
+ ret = __do_six_trylock_type(lock, lock_type, w->task, false);
+ if (ret <= 0)
+ goto unlock;
+
+ __list_del(w->list.prev, w->list.next);
+ task = w->task;
+ /*
+ * Do no writes to @w besides setting lock_acquired - otherwise
+ * we would need a memory barrier:
+ */
+ barrier();
+ w->lock_acquired = true;
+ wake_up_process(task);
+ }
+
+ clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v);
+unlock:
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (ret < 0) {
+ lock_type = -ret - 1;
+ goto again;
+ }
+}
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+ union six_lock_state state,
+ enum six_lock_type lock_type)
+{
+ if (lock_type == SIX_LOCK_write && state.read_lock)
+ return;
+
+ if (!(state.waiters & (1 << lock_type)))
+ return;
+
+ __six_lock_wakeup(lock, lock_type);
+}
+
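+/*
+ * Collapse the three-way result of __do_six_trylock_type(): > 0 means we
+ * took the lock, 0 means we did not, and -1 - lock_type means we raced with
+ * an unlock and must run the wakeup path for that waitlist before reporting
+ * failure.
+ */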
+static bool do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type,
+ bool try)
+{
+ int ret;
+
+ ret = __do_six_trylock_type(lock, type, current, try);
+ if (ret < 0)
+ __six_lock_wakeup(lock, -ret - 1);
+
+ return ret > 0;
+}
+
__always_inline __flatten
-static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned long ip)
{
if (!do_six_trylock_type(lock, type, true))
return false;
if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1);
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
return true;
}
__always_inline __flatten
static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
+ unsigned seq, unsigned long ip)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old;
@@ -304,15 +329,10 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
* Similar to the lock path, we may have caused a spurious write
* lock fail and need to issue a wakeup:
*/
- if (old.write_locking) {
- struct task_struct *p = READ_ONCE(lock->owner);
-
- if (p)
- wake_up_process(p);
- }
-
if (ret)
- six_acquire(&lock->dep_map, 1);
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+ else
+ six_lock_wakeup(lock, old, SIX_LOCK_write);
return ret;
}
@@ -327,38 +347,47 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
old.v,
old.v + l[type].lock_val)) != old.v);
- six_set_owner(lock, type, old);
+ six_set_owner(lock, type, old, current);
if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1);
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
return true;
}
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
-static inline int six_can_spin_on_owner(struct six_lock *lock)
+static inline bool six_can_spin_on_owner(struct six_lock *lock)
{
struct task_struct *owner;
- int retval = 1;
+ bool ret;
if (need_resched())
- return 0;
+ return false;
rcu_read_lock();
owner = READ_ONCE(lock->owner);
- if (owner)
- retval = owner->on_cpu;
+ ret = !owner || owner_on_cpu(owner);
rcu_read_unlock();
- /*
- * if lock->owner is not set, the mutex owner may have just acquired
- * it and not set the owner yet or the mutex has been released.
- */
- return retval;
+
+ return ret;
+}
+
+static inline void six_set_nospin(struct six_lock *lock)
+{
+ union six_lock_state old, new;
+ u64 v = READ_ONCE(lock->state.v);
+
+ do {
+ new.v = old.v = v;
+ new.nospin = true;
+ } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v);
}
static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner)
+ struct task_struct *owner,
+ u64 end_time)
{
bool ret = true;
+ unsigned loop = 0;
rcu_read_lock();
while (lock->owner == owner) {
@@ -370,7 +399,13 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
*/
barrier();
- if (!owner->on_cpu || need_resched()) {
+ if (!owner_on_cpu(owner) || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+ six_set_nospin(lock);
ret = false;
break;
}
@@ -385,6 +420,7 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
{
struct task_struct *task = current;
+ u64 end_time;
if (type == SIX_LOCK_write)
return false;
@@ -396,6 +432,8 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
if (!osq_lock(&lock->osq))
goto fail;
+ end_time = sched_clock() + 10 * NSEC_PER_USEC;
+
while (1) {
struct task_struct *owner;
@@ -404,7 +442,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
* release the lock or go to sleep.
*/
owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner))
+ if (owner && !six_spin_on_owner(lock, owner, end_time))
break;
if (do_six_trylock_type(lock, type, false)) {
@@ -457,10 +495,11 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
noinline
static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
union six_lock_state old;
- struct six_lock_waiter wait;
int ret = 0;
if (type == SIX_LOCK_write) {
@@ -469,97 +508,125 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
smp_mb__after_atomic();
}
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (ret)
- goto out_before_sleep;
+ trace_contention_begin(lock, 0);
+ lock_contended(&lock->dep_map, ip);
if (six_optimistic_spin(lock, type))
- goto out_before_sleep;
+ goto out;
+
+ wait->task = current;
+ wait->lock_want = type;
+ wait->lock_acquired = false;
+
+ raw_spin_lock(&lock->wait_lock);
+ if (!(lock->state.waiters & (1 << type)))
+ set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v);
+ /*
+ * Retry taking the lock after taking the waitlist lock - we may have
+ * raced with an unlock:
+ */
+ ret = __do_six_trylock_type(lock, type, current, false);
+ if (ret <= 0) {
+ wait->start_time = local_clock();
+
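+ /*
+ * local_clock() is not globally monotonic, so keep start_time strictly
+ * increasing along the wait list:
+ */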
+ if (!list_empty(&lock->wait_list)) {
+ struct six_lock_waiter *last =
+ list_last_entry(&lock->wait_list,
+ struct six_lock_waiter, list);
+
+ if (time_before_eq64(wait->start_time, last->start_time))
+ wait->start_time = last->start_time + 1;
+ }
+
+ list_add_tail(&wait->list, &lock->wait_list);
+ }
+ raw_spin_unlock(&lock->wait_lock);
- lock_contended(&lock->dep_map, _RET_IP_);
+ if (unlikely(ret > 0)) {
+ ret = 0;
+ goto out;
+ }
- INIT_LIST_HEAD(&wait.list);
- wait.task = current;
+ if (unlikely(ret < 0)) {
+ __six_lock_wakeup(lock, -ret - 1);
+ ret = 0;
+ }
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (type == SIX_LOCK_write)
- EBUG_ON(lock->owner != current);
- else if (list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_add_tail(&wait.list, &lock->wait_list[type]);
- raw_spin_unlock(&lock->wait_lock);
- }
- if (do_six_trylock_type(lock, type, false))
+ if (wait->lock_acquired)
break;
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (ret)
+ if (unlikely(ret)) {
+ raw_spin_lock(&lock->wait_lock);
+ if (!wait->lock_acquired)
+ list_del(&wait->list);
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (wait->lock_acquired)
+ do_six_unlock_type(lock, type);
break;
+ }
schedule();
}
__set_current_state(TASK_RUNNING);
-
- if (!list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_del_init(&wait.list);
- raw_spin_unlock(&lock->wait_lock);
- }
-out_before_sleep:
- if (ret && type == SIX_LOCK_write) {
+out:
+ if (ret && type == SIX_LOCK_write && lock->state.write_locking) {
old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
&lock->state.counter);
six_lock_wakeup(lock, old, SIX_LOCK_read);
}
+ trace_contention_end(lock, 0);
return ret;
}
-__always_inline
-static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
+__always_inline __flatten
+static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
int ret;
+ wait->start_time = 0;
+
if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 0);
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
ret = do_six_trylock_type(lock, type, true) ? 0
- : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
+ : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip);
if (ret && type != SIX_LOCK_write)
- six_release(&lock->dep_map);
+ six_release(&lock->dep_map, ip);
if (!ret)
- lock_acquired(&lock->dep_map, _RET_IP_);
+ lock_acquired(&lock->dep_map, ip);
return ret;
}
+__always_inline
+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ struct six_lock_waiter wait;
+
+ return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
__always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state state;
- EBUG_ON(type == SIX_LOCK_write &&
- !(lock->state.v & __SIX_LOCK_HELD_intent));
-
- if (type != SIX_LOCK_write)
- six_release(&lock->dep_map);
-
- if (type == SIX_LOCK_intent) {
- EBUG_ON(lock->owner != current);
-
- if (lock->intent_lock_recurse) {
- --lock->intent_lock_recurse;
- return;
- }
-
+ if (type == SIX_LOCK_intent)
lock->owner = NULL;
- }
if (type == SIX_LOCK_read &&
lock->readers) {
@@ -568,39 +635,75 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
smp_mb(); /* between unlocking and checking for waiters */
state.v = READ_ONCE(lock->state.v);
} else {
+ u64 v = l[type].unlock_val;
+
+ if (type != SIX_LOCK_read)
+ v -= lock->state.v & __SIX_VAL(nospin, 1);
+
EBUG_ON(!(lock->state.v & l[type].held_mask));
- state.v = atomic64_add_return_release(l[type].unlock_val,
- &lock->state.counter);
+ state.v = atomic64_add_return_release(v, &lock->state.counter);
}
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
+__always_inline __flatten
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned long ip)
+{
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(lock->state.v & __SIX_LOCK_HELD_intent));
+ EBUG_ON((type == SIX_LOCK_write ||
+ type == SIX_LOCK_intent) &&
+ lock->owner != current);
+
+ if (type != SIX_LOCK_write)
+ six_release(&lock->dep_map, ip);
+
+ if (type == SIX_LOCK_intent &&
+ lock->intent_lock_recurse) {
+ --lock->intent_lock_recurse;
+ return;
+ }
+
+ do_six_unlock_type(lock, type);
+}
+
#define __SIX_LOCK(type) \
-bool six_trylock_##type(struct six_lock *lock) \
+bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip) \
{ \
- return __six_trylock_type(lock, SIX_LOCK_##type); \
+ return __six_trylock_type(lock, SIX_LOCK_##type, ip); \
} \
-EXPORT_SYMBOL_GPL(six_trylock_##type); \
+EXPORT_SYMBOL_GPL(six_trylock_ip_##type); \
\
-bool six_relock_##type(struct six_lock *lock, u32 seq) \
+bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
{ \
- return __six_relock_type(lock, SIX_LOCK_##type, seq); \
+ return __six_relock_type(lock, SIX_LOCK_##type, seq, ip); \
} \
-EXPORT_SYMBOL_GPL(six_relock_##type); \
+EXPORT_SYMBOL_GPL(six_relock_ip_##type); \
\
-int six_lock_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn should_sleep_fn, void *p) \
+int six_lock_ip_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p, \
+ unsigned long ip) \
{ \
- return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
+ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
} \
-EXPORT_SYMBOL_GPL(six_lock_##type); \
+EXPORT_SYMBOL_GPL(six_lock_ip_##type); \
\
-void six_unlock_##type(struct six_lock *lock) \
+int six_lock_ip_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p,\
+ unsigned long ip) \
{ \
- __six_unlock_type(lock, SIX_LOCK_##type); \
+ return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
} \
-EXPORT_SYMBOL_GPL(six_unlock_##type);
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type); \
+ \
+void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
+{ \
+ __six_unlock_type(lock, SIX_LOCK_##type, ip); \
+} \
+EXPORT_SYMBOL_GPL(six_unlock_ip_##type);
__SIX_LOCK(read)
__SIX_LOCK(intent)
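With the exported entry points now taking an instruction pointer, the caller controls which address lockdep attributes an acquisition to (six_acquire() and lock_contended() are passed the caller-supplied ip). A minimal caller-side sketch - the wrapper names are hypothetical and not part of this patch - using the six_trylock_ip_intent()/six_unlock_ip_intent() symbols generated by __SIX_LOCK(intent) above:

static noinline bool node_trylock_intent(struct six_lock *lock)
{
	/* _RET_IP_ is our caller, so the acquisition is attributed to the
	 * call site rather than to six.c internals: */
	return six_trylock_ip_intent(lock, _RET_IP_);
}

static noinline void node_unlock_intent(struct six_lock *lock)
{
	six_unlock_ip_intent(lock, _RET_IP_);
}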
@@ -639,7 +742,7 @@ bool six_lock_tryupgrade(struct six_lock *lock)
if (lock->readers)
this_cpu_dec(*lock->readers);
- six_set_owner(lock, SIX_LOCK_intent, old);
+ six_set_owner(lock, SIX_LOCK_intent, old, current);
return true;
}
@@ -671,7 +774,7 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
- six_acquire(&lock->dep_map, 0);
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
/* XXX: assert already locked, and that we don't overflow: */
@@ -698,47 +801,20 @@ EXPORT_SYMBOL_GPL(six_lock_increment);
void six_lock_wakeup_all(struct six_lock *lock)
{
+ union six_lock_state state = lock->state;
struct six_lock_waiter *w;
- raw_spin_lock(&lock->wait_lock);
+ six_lock_wakeup(lock, state, SIX_LOCK_read);
+ six_lock_wakeup(lock, state, SIX_LOCK_intent);
+ six_lock_wakeup(lock, state, SIX_LOCK_write);
- list_for_each_entry(w, &lock->wait_list[0], list)
- wake_up_process(w->task);
- list_for_each_entry(w, &lock->wait_list[1], list)
+ raw_spin_lock(&lock->wait_lock);
+ list_for_each_entry(w, &lock->wait_list, list)
wake_up_process(w->task);
-
raw_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-struct free_pcpu_rcu {
- struct rcu_head rcu;
- void __percpu *p;
-};
-
-static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
-{
- struct free_pcpu_rcu *rcu =
- container_of(_rcu, struct free_pcpu_rcu, rcu);
-
- free_percpu(rcu->p);
- kfree(rcu);
-}
-
-void six_lock_pcpu_free_rcu(struct six_lock *lock)
-{
- struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
-
- if (!rcu)
- return;
-
- rcu->p = lock->readers;
- lock->readers = NULL;
-
- call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
-
void six_lock_pcpu_free(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
@@ -757,3 +833,20 @@ void six_lock_pcpu_alloc(struct six_lock *lock)
#endif
}
EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
+
+/*
+ * Returns lock held counts, for read, intent and write
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+ struct six_lock_count ret;
+
+ ret.n[SIX_LOCK_read] = !lock->readers
+ ? lock->state.read_lock
+ : pcpu_read_count(lock);
+ ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse;
+ ret.n[SIX_LOCK_write] = lock->state.seq & 1;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
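six_lock_counts() gives callers a snapshot of how many times the lock is held in each mode (the read count comes either from the shared state word or from the per-cpu reader counters). A minimal sketch of a caller; the helper name is hypothetical:

/* Hypothetical debug helper: a lock we are about to tear down must not be
 * held in any mode. */
static void six_assert_unlocked(struct six_lock *lock)
{
	struct six_lock_count c = six_lock_counts(lock);

	BUG_ON(c.n[SIX_LOCK_read] ||
	       c.n[SIX_LOCK_intent] ||
	       c.n[SIX_LOCK_write]);
}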
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index a7bafc413730..41f1bcdc4488 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -1,7 +1,9 @@
+#include <linux/atomic.h>
#include <linux/export.h>
#include <linux/generic-radix-tree.h>
#include <linux/gfp.h>
+#include <linux/kmemleak.h>
#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
@@ -36,12 +38,12 @@ static inline size_t genradix_depth_size(unsigned depth)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
-unsigned genradix_root_to_depth(struct genradix_root *r)
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
-struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
@@ -75,6 +77,27 @@ void *__genradix_ptr(struct __genradix *radix, size_t offset)
}
EXPORT_SYMBOL(__genradix_ptr);
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+ struct genradix_node *node;
+
+ node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO);
+
+ /*
+ * We're using pages (not slab allocations) directly for kernel data
+ * structures, so we need to explicitly inform kmemleak of them in order
+ * to avoid false positive memory leak reports.
+ */
+ kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask);
+ return node;
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+ kmemleak_free(node);
+ free_page((unsigned long)node);
+}
+
/*
* Returns pointer to the specified byte @offset within @radix, allocating it if
* necessary - newly allocated slots are always zeroed out:
@@ -97,8 +120,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
break;
if (!new_node) {
- new_node = (void *)
- __get_free_page(gfp_mask|__GFP_ZERO);
+ new_node = genradix_alloc_node(gfp_mask);
if (!new_node)
return NULL;
}
@@ -121,8 +143,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
n = READ_ONCE(*p);
if (!n) {
if (!new_node) {
- new_node = (void *)
- __get_free_page(gfp_mask|__GFP_ZERO);
+ new_node = genradix_alloc_node(gfp_mask);
if (!new_node)
return NULL;
}
@@ -133,7 +154,7 @@ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
}
if (new_node)
- free_page((unsigned long) new_node);
+ genradix_free_node(new_node);
return &n->data[offset];
}
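For context, the generic radix tree is a sparse, dynamically sized array of fixed-size objects; __genradix_ptr_alloc() above is the slow path that allocates missing nodes on demand. A minimal usage sketch, assuming the typed wrapper macros GENRADIX() and genradix_ptr_alloc() from include/linux/generic-radix-tree.h (not shown in this diff):

struct foo {
	u64	a;
	u64	b;
};

static GENRADIX(struct foo) foo_table;	/* zero-initialized: empty tree */

static int foo_table_set(size_t idx, u64 a, u64 b)
{
	/* Allocates intermediate nodes as needed; new slots are zeroed: */
	struct foo *f = genradix_ptr_alloc(&foo_table, idx, GFP_KERNEL);

	if (!f)
		return -ENOMEM;

	f->a = a;
	f->b = b;
	return 0;
}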
@@ -146,6 +167,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
struct genradix_root *r;
struct genradix_node *n;
unsigned level, i;
+
+ if (iter->offset == SIZE_MAX)
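+ /* SIZE_MAX is the "iterator exhausted" sentinel, set below on overflow: */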
+ return NULL;
+
restart:
r = READ_ONCE(radix->root);
if (!r)
@@ -164,10 +189,17 @@ restart:
(GENRADIX_ARY - 1);
while (!n->children[i]) {
+ size_t objs_per_ptr = genradix_depth_size(level);
+
+ if (iter->offset + objs_per_ptr < iter->offset) {
+ iter->offset = SIZE_MAX;
+ iter->pos = SIZE_MAX;
+ return NULL;
+ }
+
i++;
- iter->offset = round_down(iter->offset +
- genradix_depth_size(level),
- genradix_depth_size(level));
+ iter->offset = round_down(iter->offset + objs_per_ptr,
+ objs_per_ptr);
iter->pos = (iter->offset >> PAGE_SHIFT) *
objs_per_page;
if (i == GENRADIX_ARY)
@@ -181,6 +213,64 @@ restart:
}
EXPORT_SYMBOL(__genradix_iter_peek);
+void *__genradix_iter_peek_prev(struct genradix_iter *iter,
+ struct __genradix *radix,
+ size_t objs_per_page,
+ size_t obj_size_plus_page_remainder)
+{
+ struct genradix_root *r;
+ struct genradix_node *n;
+ unsigned level, i;
+
+ if (iter->offset == SIZE_MAX)
+ return NULL;
+
+restart:
+ r = READ_ONCE(radix->root);
+ if (!r)
+ return NULL;
+
+ n = genradix_root_to_node(r);
+ level = genradix_root_to_depth(r);
+
+ if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
+ iter->offset = genradix_depth_size(level);
+ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+ iter->offset -= obj_size_plus_page_remainder;
+ iter->pos--;
+ }
+
+ while (level) {
+ level--;
+
+ i = (iter->offset >> genradix_depth_shift(level)) &
+ (GENRADIX_ARY - 1);
+
+ while (!n->children[i]) {
+ size_t objs_per_ptr = genradix_depth_size(level);
+
+ iter->offset = round_down(iter->offset, objs_per_ptr);
+ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
+
+ if (!iter->offset)
+ return NULL;
+
+ iter->offset -= obj_size_plus_page_remainder;
+ iter->pos--;
+
+ if (!i)
+ goto restart;
+ --i;
+ }
+
+ n = n->children[i];
+ }
+
+ return &n->data[iter->offset & (PAGE_SIZE - 1)];
+}
+EXPORT_SYMBOL(__genradix_iter_peek_prev);
+
static void genradix_free_recurse(struct genradix_node *n, unsigned level)
{
if (level) {
@@ -191,7 +281,7 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
genradix_free_recurse(n->children[i], level - 1);
}
- free_page((unsigned long) n);
+ genradix_free_node(n);
}
int __genradix_prealloc(struct __genradix *radix, size_t size,