summaryrefslogtreecommitdiff
path: root/fs/bcachefs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/backpointers.c47
-rw-r--r--fs/bcachefs/bcachefs_format.h7
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h95
-rw-r--r--fs/bcachefs/bkey_buf.h40
-rw-r--r--fs/bcachefs/btree_gc.c4
-rw-r--r--fs/bcachefs/btree_io.c7
-rw-r--r--fs/bcachefs/btree_iter.c32
-rw-r--r--fs/bcachefs/btree_journal_iter.c209
-rw-r--r--fs/bcachefs/btree_journal_iter.h35
-rw-r--r--fs/bcachefs/btree_journal_iter_types.h38
-rw-r--r--fs/bcachefs/btree_node_scan.c2
-rw-r--r--fs/bcachefs/btree_trans_commit.c2
-rw-r--r--fs/bcachefs/btree_update.c18
-rw-r--r--fs/bcachefs/btree_update.h17
-rw-r--r--fs/bcachefs/btree_update_interior.c2
-rw-r--r--fs/bcachefs/buckets.c2
-rw-r--r--fs/bcachefs/chardev.c215
-rw-r--r--fs/bcachefs/darray.c23
-rw-r--r--fs/bcachefs/darray.h19
-rw-r--r--fs/bcachefs/dirent.c2
-rw-r--r--fs/bcachefs/disk_accounting.c143
-rw-r--r--fs/bcachefs/ec.c19
-rw-r--r--fs/bcachefs/error.c12
-rw-r--r--fs/bcachefs/extent_update.c62
-rw-r--r--fs/bcachefs/extent_update.h2
-rw-r--r--fs/bcachefs/fs-io-buffered.c28
-rw-r--r--fs/bcachefs/fs.c2
-rw-r--r--fs/bcachefs/fsck.c5
-rw-r--r--fs/bcachefs/inode.c11
-rw-r--r--fs/bcachefs/inode.h2
-rw-r--r--fs/bcachefs/inode_format.h3
-rw-r--r--fs/bcachefs/journal.c8
-rw-r--r--fs/bcachefs/journal_io.c26
-rw-r--r--fs/bcachefs/journal_io.h23
-rw-r--r--fs/bcachefs/journal_reclaim.c14
-rw-r--r--fs/bcachefs/journal_types.h2
-rw-r--r--fs/bcachefs/migrate.c4
-rw-r--r--fs/bcachefs/move.c4
-rw-r--r--fs/bcachefs/namei.c3
-rw-r--r--fs/bcachefs/opts.c2
-rw-r--r--fs/bcachefs/opts.h7
-rw-r--r--fs/bcachefs/rebalance.c7
-rw-r--r--fs/bcachefs/recovery.c63
-rw-r--r--fs/bcachefs/recovery_passes_format.h2
-rw-r--r--fs/bcachefs/replicas.c14
-rw-r--r--fs/bcachefs/replicas.h2
-rw-r--r--fs/bcachefs/sb-counters_format.h11
-rw-r--r--fs/bcachefs/sb-members.c4
-rw-r--r--fs/bcachefs/snapshot.c3
-rw-r--r--fs/bcachefs/snapshot.h15
-rw-r--r--fs/bcachefs/str_hash.h14
-rw-r--r--fs/bcachefs/super-io.c57
-rw-r--r--fs/bcachefs/super-io.h3
-rw-r--r--fs/bcachefs/super.c202
-rw-r--r--fs/bcachefs/super.h22
-rw-r--r--fs/bcachefs/xattr.c19
56 files changed, 1090 insertions, 546 deletions
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 45d3db41225a..cb25cddb759b 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -532,10 +532,6 @@ static int check_bp_exists(struct btree_trans *trans,
struct btree_iter other_extent_iter = {};
CLASS(printbuf, buf)();
- if (bpos_lt(bp->k.p, s->bp_start) ||
- bpos_gt(bp->k.p, s->bp_end))
- return 0;
-
CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, 0);
struct bkey_s_c bp_k = bch2_btree_iter_peek_slot(&bp_iter);
int ret = bkey_err(bp_k);
@@ -690,6 +686,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+ if (bpos_lt(bp.k.p, s->bp_start) ||
+ bpos_gt(bp.k.p, s->bp_end))
+ continue;
+
int ret = !empty
? check_bp_exists(trans, s, &bp, k)
: bch2_bucket_backpointer_mod(trans, k, &bp, true);
@@ -897,7 +897,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointer_bucket_gen &&
(bp.v->bucket_gen != a->gen ||
bp.v->pad)) {
ret = bch2_backpointer_del(trans, bp_k.k->p);
@@ -929,6 +929,14 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
if (sectors[ALLOC_dirty] != a->dirty_sectors ||
sectors[ALLOC_cached] != a->cached_sectors ||
sectors[ALLOC_stripe] != a->stripe_sectors) {
+ /*
+ * Post 1.14 upgrade, we assume that backpointers are mostly
+ * correct and a sector count mismatch is probably due to a
+ * write buffer race
+ *
+ * Pre upgrade, we expect all the buckets to be wrong, a write
+ * buffer flush is pointless:
+ */
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
if (ret)
@@ -976,12 +984,22 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
goto next;
struct bpos bucket = bp_pos_to_bucket(ca, pos);
- u64 next = ca->mi.nbuckets;
-
- unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
- if (bitmap)
- next = min_t(u64, next,
- find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
+ u64 next = min(bucket.offset, ca->mi.nbuckets);
+
+ unsigned long *mismatch = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
+ unsigned long *empty = READ_ONCE(ca->bucket_backpointer_empty.buckets);
+ /*
+ * Find the first bucket with mismatches - but
+ * not empty buckets; we don't need to pin those
+ * because we just recreate all backpointers in
+ * those buckets
+ */
+ if (mismatch && empty)
+ next = find_next_andnot_bit(mismatch, empty, ca->mi.nbuckets, next);
+ else if (mismatch)
+ next = find_next_bit(mismatch, ca->mi.nbuckets, next);
+ else
+ next = ca->mi.nbuckets;
bucket.offset = next;
if (bucket.offset == ca->mi.nbuckets)
@@ -1108,17 +1126,18 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
if (ret)
goto err;
- u64 nr_buckets = 0, nr_mismatches = 0;
+ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
for_each_member_device(c, ca) {
nr_buckets += ca->mi.nbuckets;
nr_mismatches += ca->bucket_backpointer_mismatch.nr;
+ nr_empty += ca->bucket_backpointer_empty.nr;
}
if (!nr_mismatches)
goto err;
- bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches, nr_buckets);
+ bch_info(c, "scanning for missing backpointers in %llu/%llu buckets, %llu buckets with no backpointers",
+ nr_mismatches - nr_empty, nr_buckets, nr_empty);
while (1) {
ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 19961b4f30b8..b2de993d802b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -706,7 +706,8 @@ struct bch_sb_field_ext {
x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
x(fast_device_removal, BCH_VERSION(1, 27)) \
x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
- x(extent_snapshot_whiteouts, BCH_VERSION(1, 29))
+ x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
+ x(31bit_dirent_offset, BCH_VERSION(1, 30))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1378,7 +1379,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_alloc_v4)) \
x(quotas, 5, 0, \
BIT_ULL(KEY_TYPE_quota)) \
- x(stripes, 6, 0, \
+ x(stripes, 6, \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_stripe)) \
x(reflink, 7, \
BTREE_IS_extents| \
@@ -1463,7 +1465,6 @@ static inline bool btree_id_can_reconstruct(enum btree_id btree)
switch (btree) {
case BTREE_ID_snapshot_trees:
case BTREE_ID_deleted_inodes:
- case BTREE_ID_logged_ops:
case BTREE_ID_rebalance_work:
case BTREE_ID_subvolume_children:
return true;
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 52594e925eb7..5dc562f2a881 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -66,33 +66,46 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_STOP _IO(0xbc, 3)
#endif
-#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ADD_v2 _IOW(0xbc, 23, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE_v2 _IOW(0xbc, 24, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE_v2 _IOW(0xbc, 25, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE_v2 _IOW(0xbc, 26, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DISK_SET_STATE_v2 _IOW(0xbc, 22, struct bch_ioctl_disk_set_state_v2)
+#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_v2 _IOW(0xbc, 27, struct bch_ioctl_disk_resize_v2)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL_v2 _IOW(0xbc, 28, struct bch_ioctl_disk_resize_journal_v2)
+
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
+#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
+
+#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
+#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
+struct bch_ioctl_err_msg {
+ __u64 msg_ptr;
+ __u32 msg_len;
+ __u32 pad;
+};
+
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
*
@@ -104,13 +117,6 @@ struct bch_ioctl_query_uuid {
__uuid_t uuid;
};
-#if 0
-struct bch_ioctl_start {
- __u32 flags;
- __u32 pad;
-};
-#endif
-
/*
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
*
@@ -164,6 +170,13 @@ struct bch_ioctl_disk {
__u64 dev;
};
+struct bch_ioctl_disk_v2 {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ struct bch_ioctl_err_msg err;
+};
+
/*
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
*
@@ -181,6 +194,14 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
+struct bch_ioctl_disk_set_state_v2 {
+ __u32 flags;
+ __u8 new_state;
+ __u8 pad[3];
+ __u64 dev;
+ struct bch_ioctl_err_msg err;
+};
+
#define BCH_DATA_OPS() \
x(scrub, 0) \
x(rereplicate, 1) \
@@ -392,6 +413,14 @@ struct bch_ioctl_disk_resize {
__u64 nbuckets;
};
+struct bch_ioctl_disk_resize_v2 {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+ struct bch_ioctl_err_msg err;
+};
+
/*
* BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
*
@@ -405,6 +434,14 @@ struct bch_ioctl_disk_resize_journal {
__u64 nbuckets;
};
+struct bch_ioctl_disk_resize_journal_v2 {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+ struct bch_ioctl_err_msg err;
+};
+
struct bch_ioctl_subvolume {
__u32 flags;
__u32 dirfd;
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
index a30c4ae8eb36..0a1fc582f53a 100644
--- a/fs/bcachefs/bkey_buf.h
+++ b/fs/bcachefs/bkey_buf.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_BKEY_BUF_H
#define _BCACHEFS_BKEY_BUF_H
+#include <linux/mempool.h>
+
#include "bcachefs.h"
#include "bkey.h"
@@ -10,41 +12,49 @@ struct bkey_buf {
u64 onstack[12];
};
-static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
- struct bch_fs *c, unsigned u64s)
+static inline int bch2_bkey_buf_realloc_noprof(struct bkey_buf *s,
+ struct bch_fs *c, unsigned u64s)
{
if (s->k == (void *) s->onstack &&
u64s > ARRAY_SIZE(s->onstack)) {
- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ s->k = mempool_alloc_noprof(&c->large_bkey_pool, GFP_NOFS);
memcpy(s->k, s->onstack, sizeof(s->onstack));
}
+
+ return 0; /* for alloc_hooks() macro */
}
+#define bch2_bkey_buf_realloc(...) alloc_hooks(bch2_bkey_buf_realloc_noprof(__VA_ARGS__))
-static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_s_c k)
+static inline int bch2_bkey_buf_reassemble_noprof(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
{
bch2_bkey_buf_realloc(s, c, k.k->u64s);
bkey_reassemble(s->k, k);
+ return 0;
}
+#define bch2_bkey_buf_reassemble(...) alloc_hooks(bch2_bkey_buf_reassemble_noprof(__VA_ARGS__))
-static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_i *src)
+static inline int bch2_bkey_buf_copy_noprof(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_i *src)
{
bch2_bkey_buf_realloc(s, c, src->k.u64s);
bkey_copy(s->k, src);
+ return 0;
}
+#define bch2_bkey_buf_copy(...) alloc_hooks(bch2_bkey_buf_copy_noprof(__VA_ARGS__))
-static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
- struct bch_fs *c,
- struct btree *b,
- struct bkey_packed *src)
+static inline int bch2_bkey_buf_unpack_noprof(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct btree *b,
+ struct bkey_packed *src)
{
- bch2_bkey_buf_realloc(s, c, BKEY_U64s +
- bkeyp_val_u64s(&b->format, src));
+ bch2_bkey_buf_realloc(s, c, BKEY_U64s + bkeyp_val_u64s(&b->format, src));
bch2_bkey_unpack(b, s->k, src);
+ return 0;
}
+#define bch2_bkey_buf_unpack(...) alloc_hooks(bch2_bkey_buf_unpack_noprof(__VA_ARGS__))
static inline void bch2_bkey_buf_init(struct bkey_buf *s)
{
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index ae7d260589d8..43f294284d57 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -356,7 +356,7 @@ again:
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
BUG_ON(bpos_gt(k.k->p, b->data->max_key));
@@ -470,7 +470,7 @@ again:
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 276cf088539e..2e3dd9bacac5 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -131,10 +131,10 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = kvmalloc(size, GFP_NOWAIT);
+ p = kvmalloc(size, GFP_NOWAIT|__GFP_ACCOUNT|__GFP_RECLAIMABLE);
if (!p) {
*used_mempool = true;
- p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS|__GFP_ACCOUNT|__GFP_RECLAIMABLE);
}
memalloc_nofs_restore(flags);
return p;
@@ -1014,6 +1014,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k = bkey_p_next(k);
continue;
drop_this_key:
+ ret = 0;
next_good_key = k->u64s;
if (!next_good_key ||
@@ -1470,7 +1471,7 @@ start:
}
prt_newline(&buf);
- if (failed.nr)
+ if (ret || failed.nr)
bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
async_object_list_del(c, btree_read_bio, rb->list_idx);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 8962c481e310..76f430f93dc1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -650,7 +650,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
i->k->k.p);
@@ -848,7 +848,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
break;
bch2_btree_and_journal_iter_advance(jiter);
- k = bch2_btree_and_journal_iter_peek(jiter);
+ k = bch2_btree_and_journal_iter_peek(c, jiter);
if (!k.k)
break;
@@ -898,7 +898,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
- k = bch2_btree_and_journal_iter_peek(&jiter);
+ k = bch2_btree_and_journal_iter_peek(c, &jiter);
if (!k.k) {
CLASS(printbuf, buf)();
@@ -2120,10 +2120,10 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_
}
}
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_pos,
- struct bpos end_pos)
+static const struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_pos,
+ struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2139,7 +2139,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
+ const struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
if (k) {
iter->k = k->k;
@@ -2156,7 +2156,7 @@ void btree_trans_peek_journal(struct btree_trans *trans,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
+ const struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
@@ -2165,10 +2165,10 @@ void btree_trans_peek_journal(struct btree_trans *trans,
}
}
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bpos end_pos)
+static const struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_key,
+ struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2186,7 +2186,7 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
+ const struct bkey_i *next_journal =
bch2_btree_journal_peek_prev(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->data->min_key);
@@ -2366,7 +2366,9 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_nofilter_whiteouts) &&
+ bkey_eq(end, POS_MAX));
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 24f2fbe84ad7..a6f344faf751 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -46,21 +46,22 @@ static size_t __bch2_journal_key_search(struct journal_keys *keys,
enum btree_id id, unsigned level,
struct bpos pos)
{
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
size_t l = 0, r = keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+ if (__journal_key_cmp(c, id, level, pos, idx_to_key(keys, m)) > 0)
l = m + 1;
else
r = m;
}
BUG_ON(l < keys->nr &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+ __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l)) > 0);
BUG_ON(l &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+ __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l - 1)) <= 0);
return l;
}
@@ -72,10 +73,20 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
+static inline struct journal_key_range_overwritten *__overwrite_range(struct journal_keys *keys, u32 idx)
+{
+ return idx ? keys->overwrites.data + idx : NULL;
+}
+
+static inline struct journal_key_range_overwritten *overwrite_range(struct journal_keys *keys, u32 idx)
+{
+ return idx ? rcu_dereference(keys->overwrites.data) + idx : NULL;
+}
+
/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
+const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
{
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
@@ -87,7 +98,7 @@ search:
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
--(*idx);
iters++;
if (iters == 10) {
@@ -96,23 +107,23 @@ search:
}
}
- struct bkey_i *ret = NULL;
+ const struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+ if (__journal_key_cmp(c, btree_id, level, end_pos, k) < 0)
break;
if (k->overwritten) {
if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->end;
+ *idx = overwrite_range(keys, k->overwritten_range)->end;
else
*idx += 1;
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
- ret = k->k;
+ if (__journal_key_cmp(c, btree_id, level, pos, k) <= 0) {
+ ret = journal_key_k(c, k);
break;
}
@@ -129,9 +140,9 @@ search:
return ret;
}
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
+const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
{
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
@@ -146,7 +157,7 @@ search:
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx < keys->nr &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
+ __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
(*idx)++;
iters++;
if (iters == 10) {
@@ -158,25 +169,25 @@ search:
if (*idx == keys->nr)
--(*idx);
- struct bkey_i *ret = NULL;
+ const struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while (true) {
k = idx_to_key(keys, *idx);
- if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
+ if (__journal_key_cmp(c, btree_id, level, end_pos, k) > 0)
break;
if (k->overwritten) {
if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->start;
+ *idx = overwrite_range(keys, k->overwritten_range)->start;
if (!*idx)
break;
--(*idx);
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
- ret = k->k;
+ if (__journal_key_cmp(c, btree_id, level, pos, k) >= 0) {
+ ret = journal_key_k(c, k);
break;
}
@@ -194,8 +205,8 @@ search:
return ret;
}
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos)
+const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
{
size_t idx = 0;
@@ -264,13 +275,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
struct journal_key n = {
.btree_id = id,
.level = level,
- .k = k,
.allocated = true,
- /*
- * Ensure these keys are done last by journal replay, to unblock
- * journal reclaim:
- */
- .journal_seq = U64_MAX,
+ .allocated_k = k,
};
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
@@ -278,8 +284,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
- journal_key_cmp(&n, &keys->data[idx]) == 0) {
- struct bkey_i *o = keys->data[idx].k;
+ journal_key_cmp(c, &n, &keys->data[idx]) == 0) {
+ struct bkey_i *o = journal_key_k(c, &keys->data[idx]);
if (k->k.type == KEY_TYPE_accounting &&
o->k.type == KEY_TYPE_accounting) {
@@ -291,7 +297,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
}
if (keys->data[idx].allocated)
- kfree(keys->data[idx].k);
+ kfree(keys->data[idx].allocated_k);
keys->data[idx] = n;
return 0;
}
@@ -376,17 +382,20 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
unsigned level, struct bpos pos)
{
- struct journal_keys *keys = &trans->c->journal_keys;
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ struct bch_fs *c = trans->c;
+ struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (!trans->journal_replay_not_finished)
+ if (idx >= keys->size ||
+ keys->data[idx].btree_id != btree ||
+ keys->data[idx].level != level)
return false;
- return (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- bkey_deleted(&keys->data[idx].k->k));
+ struct bkey_i *k = journal_key_k(c, &keys->data[idx]);
+ return bpos_eq(k->k.p, pos) && bkey_deleted(&k->k);
}
static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
@@ -403,9 +412,9 @@ static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos
bool next_overwritten = next && next->overwritten;
struct journal_key_range_overwritten *prev_range =
- prev_overwritten ? prev->overwritten_range : NULL;
+ prev_overwritten ? __overwrite_range(keys, prev->overwritten_range) : NULL;
struct journal_key_range_overwritten *next_range =
- next_overwritten ? next->overwritten_range : NULL;
+ next_overwritten ? __overwrite_range(keys, next->overwritten_range) : NULL;
BUG_ON(prev_range && prev_range->end != idx);
BUG_ON(next_range && next_range->start != idx + 1);
@@ -413,37 +422,47 @@ static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos
if (prev_range && next_range) {
prev_range->end = next_range->end;
- keys->data[pos].overwritten_range = prev_range;
+ keys->data[pos].overwritten_range = prev->overwritten_range;
+
+ u32 old = next->overwritten_range;
+
for (size_t i = next_range->start; i < next_range->end; i++) {
struct journal_key *ip = keys->data + idx_to_pos(keys, i);
- BUG_ON(ip->overwritten_range != next_range);
- ip->overwritten_range = prev_range;
+ BUG_ON(ip->overwritten_range != old);
+ ip->overwritten_range = prev->overwritten_range;
}
-
- kfree_rcu_mightsleep(next_range);
} else if (prev_range) {
prev_range->end++;
- k->overwritten_range = prev_range;
+ k->overwritten_range = prev->overwritten_range;
if (next_overwritten) {
prev_range->end++;
- next->overwritten_range = prev_range;
+ next->overwritten_range = prev->overwritten_range;
}
} else if (next_range) {
next_range->start--;
- k->overwritten_range = next_range;
+ k->overwritten_range = next->overwritten_range;
if (prev_overwritten) {
next_range->start--;
- prev->overwritten_range = next_range;
+ prev->overwritten_range = next->overwritten_range;
}
} else if (prev_overwritten || next_overwritten) {
- struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
+ /* 0 is a sentinel value */
+ if (darray_resize_rcu(&keys->overwrites, max(keys->overwrites.nr + 1, 2)))
return;
- r->start = idx - (size_t) prev_overwritten;
- r->end = idx + 1 + (size_t) next_overwritten;
+ if (!keys->overwrites.nr)
+ darray_push(&keys->overwrites, (struct journal_key_range_overwritten) {});
+
+ darray_push(&keys->overwrites, ((struct journal_key_range_overwritten) {
+ .start = idx - (size_t) prev_overwritten,
+ .end = idx + 1 + (size_t) next_overwritten,
+ }));
+
+ smp_wmb();
+ u32 r = keys->overwrites.nr - 1;
+
+ k->overwritten_range = r;
- rcu_assign_pointer(k->overwritten_range, r);
if (prev_overwritten)
prev->overwritten_range = r;
if (next_overwritten)
@@ -457,11 +476,15 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- !keys->data[idx].overwritten) {
+ if (idx >= keys->size ||
+ keys->data[idx].btree_id != btree ||
+ keys->data[idx].level != level ||
+ keys->data[idx].overwritten)
+ return;
+
+ struct bkey_i *k = journal_key_k(c, &keys->data[idx]);
+
+ if (bpos_eq(k->k.p, pos)) {
guard(mutex)(&keys->overwrite_lock);
__bch2_journal_key_overwritten(keys, idx);
}
@@ -476,7 +499,7 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
}
}
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static struct bkey_s_c bch2_journal_iter_peek(struct bch_fs *c, struct journal_iter *iter)
{
journal_iter_verify(iter);
@@ -490,10 +513,10 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
BUG_ON(cmp);
if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
+ return bkey_i_to_s_c(journal_key_k(c, k));
if (k->overwritten_range)
- iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
+ iter->idx = idx_to_pos(iter->keys, overwrite_range(iter->keys, k->overwritten_range)->end);
else
bch2_journal_iter_advance(iter);
}
@@ -554,7 +577,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
while (nr--) {
bch2_btree_and_journal_iter_advance(&iter);
- struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(c, &iter);
if (!k.k)
break;
@@ -565,7 +588,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
bch2_bkey_buf_exit(&tmp, c);
}
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *c, struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
size_t iters = 0;
@@ -586,7 +609,7 @@ again:
bch2_journal_iter_advance_btree(iter);
if (iter->trans->journal_replay_not_finished)
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ while ((journal_k = bch2_journal_iter_peek(c, &iter->journal)).k &&
bpos_lt(journal_k.k->p, iter->pos))
bch2_journal_iter_advance(&iter->journal);
@@ -658,15 +681,22 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
/*
* When keys compare equal, oldest compares first:
*/
-static int journal_sort_key_cmp(const void *_l, const void *_r)
+static int journal_sort_key_cmp(const void *_l, const void *_r, const void *priv)
{
+ struct bch_fs *c = (void *) priv;
const struct journal_key *l = _l;
const struct journal_key *r = _r;
int rewind = l->rewind && r->rewind ? -1 : 1;
- return journal_key_cmp(l, r) ?:
- ((cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset)) * rewind);
+ int cmp = journal_key_cmp(c, l, r);
+ if (cmp)
+ return cmp;
+
+ if (l->allocated || r->allocated)
+ return cmp_int(l->allocated, r->allocated);
+
+ return ((cmp_int(l->journal_seq_offset, r->journal_seq_offset) ?:
+ cmp_int(l->journal_offset, r->journal_offset)) * rewind);
}
void bch2_journal_keys_put(struct bch_fs *c)
@@ -680,20 +710,16 @@ void bch2_journal_keys_put(struct bch_fs *c)
move_gap(keys, keys->nr);
- darray_for_each(*keys, i) {
- if (i->overwritten_range &&
- (i == &darray_last(*keys) ||
- i->overwritten_range != i[1].overwritten_range))
- kfree(i->overwritten_range);
-
+ darray_for_each(*keys, i)
if (i->allocated)
- kfree(i->k);
- }
+ kfree(i->allocated_k);
kvfree(keys->data);
keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
+ darray_exit(&keys->overwrites);
+
struct journal_replay **i;
struct genradix_iter iter;
@@ -704,8 +730,10 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
- journal_sort_key_cmp, NULL);
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
+
+ sort_r_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
+ journal_sort_key_cmp, NULL, c);
cond_resched();
@@ -717,9 +745,10 @@ static void __journal_keys_sort(struct journal_keys *keys)
* compare each individual accounting key against the version in
* the btree during replay:
*/
- if (src->k->k.type != KEY_TYPE_accounting &&
+ struct bkey_i *k = journal_key_k(c, src);
+ if (k->k.type != KEY_TYPE_accounting &&
src + 1 < &darray_top(*keys) &&
- !journal_key_cmp(src, src + 1))
+ !journal_key_cmp(c, src, src + 1))
continue;
*dst++ = *src;
@@ -763,8 +792,7 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.btree_id = entry->btree_id,
.level = entry->level,
.rewind = rewind,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
+ .journal_seq_offset = journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)),
.journal_offset = k->_data - i->j._data,
};
@@ -801,13 +829,18 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
move_gap(keys, keys->nr);
- darray_for_each(*keys, i)
+ darray_for_each(*keys, i) {
+ struct bkey_i *k = journal_key_k(c, i);
+
if (!(i->btree_id == btree &&
i->level >= level_min &&
i->level <= level_max &&
- bpos_ge(i->k->k.p, start) &&
- bpos_le(i->k->k.p, end)))
+ bpos_ge(k->k.p, start) &&
+ bpos_le(k->k.p, end)))
keys->data[dst++] = *i;
+ else if (i->allocated)
+ kfree(i->allocated_k);
+ }
keys->nr = keys->gap = dst;
}
@@ -825,7 +858,7 @@ void bch2_journal_keys_dump(struct bch_fs *c)
prt_printf(&buf, "btree=");
bch2_btree_id_to_text(&buf, i->btree_id);
prt_printf(&buf, " l=%u ", i->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(journal_key_k(c, i)));
pr_err("%s", buf.buf);
}
}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 2a3082919b8d..8dc8e778be6c 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -29,6 +29,22 @@ struct btree_and_journal_iter {
bool fail_if_too_many_whiteouts;
};
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static inline struct bkey_i *journal_key_k(struct bch_fs *c,
+ const struct journal_key *k)
+{
+ if (k->allocated)
+ return k->allocated_k;
+
+ struct journal_replay *i = *genradix_ptr(&c->journal_entries, k->journal_seq_offset);
+
+ return (struct bkey_i *) (i->j._data + k->journal_offset);
+}
+
static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
unsigned l_level,
const struct journal_key *r)
@@ -37,25 +53,28 @@ static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
cmp_int(l_btree_id, r->btree_id);
}
-static inline int __journal_key_cmp(enum btree_id l_btree_id,
+static inline int __journal_key_cmp(struct bch_fs *c,
+ enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
const struct journal_key *r)
{
return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
- bpos_cmp(l_pos, r->k->k.p);
+ bpos_cmp(l_pos, journal_key_k(c, r)->k.p);
}
-static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+static inline int journal_key_cmp(struct bch_fs *c,
+ const struct journal_key *l, const struct journal_key *r)
{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+ return __journal_key_cmp(c, l->btree_id, l->level,
+ journal_key_k(c, l)->k.p, r);
}
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
@@ -71,7 +90,7 @@ bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned,
void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *, struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
index 86aacb254fb2..4495fc92f848 100644
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ b/fs/bcachefs/btree_journal_iter_types.h
@@ -2,21 +2,47 @@
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+struct journal_ptr {
+ bool csum_good;
+ struct bch_csum csum;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+};
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache registration
+ */
+struct journal_replay {
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
+
+ bool csum_good;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
+ /* must be last: */
+ struct jset j;
+};
+
struct journal_key_range_overwritten {
size_t start, end;
};
struct journal_key {
- u64 journal_seq;
- u32 journal_offset;
+ union {
+ struct {
+ u32 journal_seq_offset;
+ u32 journal_offset;
+ };
+ struct bkey_i *allocated_k;
+ };
enum btree_id btree_id:8;
unsigned level:8;
bool allocated:1;
bool overwritten:1;
bool rewind:1;
- struct journal_key_range_overwritten __rcu *
- overwritten_range;
- struct bkey_i *k;
+ u32 overwritten_range;
};
struct journal_keys {
@@ -31,7 +57,9 @@ struct journal_keys {
size_t gap;
atomic_t ref;
bool initial_ref_held;
+
struct mutex overwrite_lock;
+ DARRAY(struct journal_key_range_overwritten) overwrites;
};
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 6b747c053e91..b618a0bd1186 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -534,7 +534,7 @@ int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos node_min, struct bpos node_max)
{
- if (btree_id_recovers_from_scan(btree))
+ if (!btree_id_recovers_from_scan(btree))
return 0;
struct find_btree_nodes *f = &c->found_btree_nodes;
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 4d58bdb233e9..5fa7f2f9f1e9 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -54,7 +54,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
if (j_k)
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index f59f018fe0d8..b70eb095a37e 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -12,7 +12,6 @@
#include "extents.h"
#include "keylist.h"
#include "snapshot.h"
-#include "super-io.h"
#include "trace.h"
#include <linux/string_helpers.h>
@@ -159,21 +158,6 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
return ret;
}
-static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree, const struct bkey *k)
-{
- /*
- * KEY_TYPE_extent_whiteout indicates that there isn't a real extent
- * present at that position: key start positions inclusive of
- * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are
- * monotonically increasing
- */
- return btree_id_is_extents_snapshots(btree) &&
- bkey_deleted(k) &&
- !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts)
- ? KEY_TYPE_extent_whiteout
- : KEY_TYPE_whiteout;
-}
-
int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
struct btree_iter *iter,
enum btree_iter_update_trigger_flags flags,
@@ -419,7 +403,7 @@ __btree_trans_update_by_path(struct btree_trans *trans,
i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
if (j_k) {
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 663739db82b1..18560ca80057 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -5,6 +5,7 @@
#include "btree_iter.h"
#include "journal.h"
#include "snapshot.h"
+#include "super-io.h"
struct bch_fs;
struct btree;
@@ -110,6 +111,22 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
: 0;
}
+static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree,
+ const struct bkey *k)
+{
+ /*
+ * KEY_TYPE_extent_whiteout indicates that there isn't a real extent
+ * present at that position: key start positions inclusive of
+ * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are
+ * monotonically increasing
+ */
+ return btree_id_is_extents_snapshots(btree) &&
+ bkey_deleted(k) &&
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts)
+ ? KEY_TYPE_extent_whiteout
+ : KEY_TYPE_whiteout;
+}
+
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
enum btree_iter_update_trigger_flags,
struct bkey_s_c, struct bkey_s_c);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 65ca54c5b0ff..a9877a47bfc6 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -95,7 +95,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!b->c.level)
goto out;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
goto out;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 87a6f4dce296..280b169efb62 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -111,7 +111,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
CLASS(printbuf, buf)();
int ret = 0;
- CLASS(bch2_dev_tryget, ca)(c, p.ptr.dev);
+ CLASS(bch2_dev_tryget_noerror, ca)(c, p.ptr.dev);
if (!ca) {
if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
trans, ptr_to_invalid_device,
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 467fc45e84fe..f6f90d421f27 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -187,6 +187,18 @@ static long bch2_ioctl_stop(struct bch_fs *c)
}
#endif
+static int copy_ioctl_err_msg(struct bch_ioctl_err_msg *dst, struct printbuf *src, int ret)
+{
+ if (ret) {
+ prt_printf(src, "error=%s", bch2_err_str(ret));
+ ret = copy_to_user_errcode((void __user *)(ulong)dst->msg_ptr,
+ src->buf,
+ min(src->pos, dst->msg_len)) ?: ret;
+ }
+
+ return ret;
+}
+
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
{
char *path;
@@ -203,13 +215,37 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
if (ret)
return ret;
- ret = bch2_dev_add(c, path);
- if (!IS_ERR(path))
- kfree(path);
+ CLASS(printbuf, err)();
+ ret = bch2_dev_add(c, path, &err);
+ if (ret)
+ bch_err(c, "%s", err.buf);
+ kfree(path);
return ret;
}
+static long bch2_ioctl_disk_add_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ char *path = NULL;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ CLASS(printbuf, err)();
+ ret = bch2_dev_add(c, path, &err);
+ kfree(path);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
{
if (!capable(CAP_SYS_ADMIN))
@@ -226,7 +262,32 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
if (IS_ERR(ca))
return PTR_ERR(ca);
- return bch2_dev_remove(c, ca, arg.flags);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_remove(c, ca, arg.flags, &err);
+ if (ret)
+ bch_err(ca, "%s", err.buf);
+ return ret;
+}
+
+static long bch2_ioctl_disk_remove_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ struct bch_dev *ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_remove(c, ca, arg.flags, &err);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
}
static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
@@ -245,11 +306,36 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
if (ret)
return ret;
- ret = bch2_dev_online(c, path);
+ CLASS(printbuf, err)();
+ ret = bch2_dev_online(c, path, &err);
+ if (ret)
+ bch_err(c, "%s", err.buf);
kfree(path);
return ret;
}
+static long bch2_ioctl_disk_online_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ char *path;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ CLASS(printbuf, err)();
+ ret = bch2_dev_online(c, path, &err);
+ kfree(path);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
{
if (!capable(CAP_SYS_ADMIN))
@@ -266,7 +352,32 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
if (IS_ERR(ca))
return PTR_ERR(ca);
- return bch2_dev_offline(c, ca, arg.flags);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_offline(c, ca, arg.flags, &err);
+ if (ret)
+ bch_err(ca, "%s", err.buf);
+ return ret;
+}
+
+static long bch2_ioctl_disk_offline_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_offline(c, ca, arg.flags, &err);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
}
static long bch2_ioctl_disk_set_state(struct bch_fs *c,
@@ -287,11 +398,40 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- int ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags, &err);
bch_err_msg(ca, ret, "setting device state");
return ret;
}
+static long bch2_ioctl_disk_set_state_v2(struct bch_fs *c,
+ struct bch_ioctl_disk_set_state_v2 arg)
+{
+ CLASS(printbuf, err)();
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad[0] || arg.pad[1] || arg.pad[2] ||
+ arg.new_state >= BCH_MEMBER_STATE_NR)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ int ret = PTR_ERR_OR_ZERO(ca);
+ if (ret) {
+ prt_printf(&err, "device %llu not found\n", arg.dev);
+ goto err;
+ }
+
+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags, &err);
+err:
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
struct bch_data_ctx {
struct thread_with_file thr;
@@ -620,7 +760,30 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- return bch2_dev_resize(c, ca, arg.nbuckets);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_resize(c, ca, arg.nbuckets, &err);
+ if (ret)
+ bch_err(ca, "%s", err.buf);
+ return ret;
+}
+
+static long bch2_ioctl_disk_resize_v2(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_resize(c, ca, arg.nbuckets, &err);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
}
static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
@@ -643,6 +806,28 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
}
+static long bch2_ioctl_disk_resize_journal_v2(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_journal_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ if (arg.nbuckets > U32_MAX)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
@@ -684,20 +869,34 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_ADD_v2:
+ BCH_IOCTL(disk_add_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_REMOVE:
BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_REMOVE_v2:
+ BCH_IOCTL(disk_remove_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_ONLINE:
BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_ONLINE_v2:
+ BCH_IOCTL(disk_online_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_OFFLINE:
BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_OFFLINE_v2:
+ BCH_IOCTL(disk_offline_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_SET_STATE:
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+ case BCH_IOCTL_DISK_SET_STATE_v2:
+ BCH_IOCTL(disk_set_state_v2, struct bch_ioctl_disk_set_state_v2);
case BCH_IOCTL_DATA:
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+ case BCH_IOCTL_DISK_RESIZE_v2:
+ BCH_IOCTL(disk_resize_v2, struct bch_ioctl_disk_resize_v2);
case BCH_IOCTL_DISK_RESIZE_JOURNAL:
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
+ case BCH_IOCTL_DISK_RESIZE_JOURNAL_v2:
+ BCH_IOCTL(disk_resize_journal_v2, struct bch_ioctl_disk_resize_journal_v2);
case BCH_IOCTL_FSCK_ONLINE:
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
case BCH_IOCTL_QUERY_ACCOUNTING:
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index e86d36d23e9e..6940037bd19e 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/log2.h>
+#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include "darray.h"
-int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp,
+ bool rcu)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
@@ -20,18 +22,25 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_
if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
return -ENOMEM;
- void *data = likely(bytes < INT_MAX)
+ void *old = d->data;
+ void *new = likely(bytes < INT_MAX)
? kvmalloc_noprof(bytes, gfp)
: vmalloc_noprof(bytes);
- if (!data)
+ if (!new)
return -ENOMEM;
if (d->size)
- memcpy(data, d->data, d->size * element_size);
- if (d->data != d->preallocated)
- kvfree(d->data);
- d->data = data;
+ memcpy(new, old, d->size * element_size);
+
+ rcu_assign_pointer(d->data, new);
d->size = new_size;
+
+ if (old != d->preallocated) {
+ if (!rcu)
+ kvfree(old);
+ else
+ kvfree_rcu_mightsleep(old);
+ }
}
return 0;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 4080ee99aadd..b4f284fe9652 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -34,17 +34,17 @@ typedef DARRAY(s16) darray_s16;
typedef DARRAY(s32) darray_s32;
typedef DARRAY(s64) darray_s64;
-int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
+int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t, bool);
#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
-#define __darray_resize(_d, _element_size, _new_size, _gfp) \
+#define __darray_resize(_d, _element_size, _new_size, _gfp, _rcu) \
(unlikely((_new_size) > (_d)->size) \
- ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
+ ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp), _rcu)\
: 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
- __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp, false)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
@@ -55,6 +55,12 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
+#define darray_resize_rcu(_d, _new_size) \
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), GFP_KERNEL, true)
+
+#define darray_make_room_rcu(_d, _more) \
+ darray_resize_rcu((_d), (_d)->nr + (_more))
+
#define darray_room(_d) ((_d).size - (_d).nr)
#define darray_top(_d) ((_d).data[(_d).nr])
@@ -107,8 +113,11 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
+#define darray_for_each_from(_d, _i, _start) \
+ for (typeof(&(_d).data[0]) _i = _start; _i < (_d).data + (_d).nr; _i++)
+
#define darray_for_each(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+ darray_for_each_from(_d, _i, (_d).data)
#define darray_for_each_reverse(_d, _i) \
for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index cb44b35e0f1d..fe6f3d874a47 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -95,7 +95,7 @@ static u64 bch2_dirent_hash(const struct bch_hash_info *info,
bch2_str_hash_update(&ctx, info, name->name, name->len);
/* [0,2) reserved for dots */
- return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
+ return max_t(u64, bch2_str_hash_end(&ctx, info, true), 2);
}
static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 5944ad6d0f8d..809c76b68ba8 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -734,6 +734,37 @@ invalid_device:
goto fsck_err;
}
+static struct journal_key *accumulate_newer_accounting_keys(struct bch_fs *c, struct journal_key *i)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct bkey_i *k = journal_key_k(c, i);
+
+ darray_for_each_from(*keys, j, i + 1) {
+ if (journal_key_cmp(c, i, j))
+ return j;
+
+ struct bkey_i *n = journal_key_k(c, j);
+ if (n->k.type == KEY_TYPE_accounting) {
+ WARN_ON(bversion_cmp(k->k.bversion, n->k.bversion) >= 0);
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(k),
+ bkey_i_to_s_c_accounting(n));
+ j->overwritten = true;
+ }
+ }
+
+ return &darray_top(*keys);
+}
+
+static struct journal_key *accumulate_and_read_journal_accounting(struct btree_trans *trans, struct journal_key *i)
+{
+ struct bch_fs *c = trans->c;
+ struct journal_key *next = accumulate_newer_accounting_keys(c, i);
+
+ int ret = accounting_read_key(trans, bkey_i_to_s_c(journal_key_k(c, i)));
+ return ret ? ERR_PTR(ret) : next;
+}
+
/*
* At startup time, initialize the in memory accounting from the btree (and
* journal)
@@ -759,80 +790,76 @@ int bch2_accounting_read(struct bch_fs *c)
percpu_memset(c->usage, 0, sizeof(*c->usage));
}
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *jk = keys->data;
+
+ while (jk < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+ jk++;
+
+ struct journal_key *end = jk;
+ while (end < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+ end++;
+
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
iter.flags &= ~BTREE_ITER_with_journal;
int ret = for_each_btree_key_continue(trans, iter,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
- if (k.k->type != KEY_TYPE_accounting)
- continue;
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next;
- memset(&next, 0, sizeof(next));
- next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
- accounting_read_key(trans, k);
- }));
- bch2_trans_iter_exit(&iter);
- if (ret)
- return ret;
-
- struct journal_keys *keys = &c->journal_keys;
- struct journal_key *dst = keys->data;
- move_gap(keys, keys->nr);
-
- darray_for_each(*keys, i) {
- if (i->k->k.type == KEY_TYPE_accounting) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
- if (!bch2_accounting_is_mem(&acc_k))
- continue;
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
- struct bkey_s_c k = bkey_i_to_s_c(i->k);
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
- sizeof(acc->k.data[0]),
- accounting_pos_cmp, &k.k->p);
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+ bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+ jk->overwritten = true;
+ jk++;
+ }
- bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
+ if (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
- if (applied)
- continue;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
- if (i + 1 < &darray_top(*keys) &&
- i[1].k->k.type == KEY_TYPE_accounting &&
- !journal_key_cmp(i, i + 1)) {
- WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
- i[1].journal_seq = i[0].journal_seq;
+ if (!bch2_accounting_is_mem(&acc_k)) {
+ struct disk_accounting_pos next_acc;
+ memset(&next_acc, 0, sizeof(next_acc));
+ next_acc.type = acc_k.type + 1;
+ struct bpos next = disk_accounting_pos_to_bpos(&next_acc);
+ if (jk < end)
+ next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+ bch2_btree_iter_set_pos(&iter, next);
+ continue;
+ }
- bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
- bkey_s_c_to_accounting(k));
- continue;
- }
+ accounting_read_key(trans, k);
+ }));
+ bch2_trans_iter_exit(&iter);
+ if (ret)
+ return ret;
- ret = accounting_read_key(trans, k);
- if (ret)
- return ret;
- }
+ while (jk < end)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
- *dst++ = *i;
- }
+ struct journal_key *dst = keys->data;
+ darray_for_each(*keys, i)
+ if (!i->overwritten)
+ *dst++ = *i;
keys->gap = keys->nr = dst - keys->data;
guard(percpu_write)(&c->mark_lock);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 85ec9f877c18..c2840cb674b2 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -35,8 +35,6 @@
#include <linux/raid/pq.h>
#include <linux/raid/xor.h>
-static bool bch2_stripe_is_open(struct bch_fs *, u64);
-
static void raid5_recov(unsigned disks, unsigned failed_idx,
size_t size, void **data)
{
@@ -388,20 +386,11 @@ int bch2_trigger_stripe(struct btree_trans *trans,
new_s->nr_redundant != old_s->nr_redundant));
if (flags & BTREE_TRIGGER_transactional) {
- u64 old_lru_pos = stripe_lru_pos(old_s);
- u64 new_lru_pos = stripe_lru_pos(new_s);
-
- if (new_lru_pos == STRIPE_LRU_POS_EMPTY &&
- !bch2_stripe_is_open(c, idx)) {
- _new.k->type = KEY_TYPE_deleted;
- set_bkey_val_u64s(_new.k, 0);
- new_s = NULL;
- new_lru_pos = 0;
- }
-
int ret = bch2_lru_change(trans,
- BCH_LRU_STRIPE_FRAGMENTATION, idx,
- old_lru_pos, new_lru_pos);
+ BCH_LRU_STRIPE_FRAGMENTATION,
+ idx,
+ stripe_lru_pos(old_s),
+ stripe_lru_pos(new_s));
if (ret)
return ret;
}
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 32a286b3a74e..e33f3166c48a 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -141,14 +141,16 @@ void bch2_io_error_work(struct work_struct *work)
if (ca->mi.state >= BCH_MEMBER_STATE_ro)
return;
- bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
CLASS(printbuf, buf)();
__bch2_log_msg_start(ca->name, &buf);
- prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
- c->opts.write_error_timeout,
- dev ? "device" : "filesystem");
+ prt_printf(&buf, "writes erroring for %u seconds\n",
+ c->opts.write_error_timeout);
+
+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED, &buf);
+
+ prt_printf(&buf, "setting %s ro", dev ? "device" : "filesystem");
if (!dev)
bch2_fs_emergency_read_only2(c, &buf);
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index c4b0ea1adaa8..73eb28090bc7 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -98,11 +98,13 @@ static int count_iters_for_insert(struct btree_trans *trans,
return ret2 ?: ret;
}
-int bch2_extent_atomic_end(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos *end)
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
- unsigned nr_iters = 0;
+ enum bch_bkey_type whiteout_type =
+ extent_whiteout_type(trans->c, iter->btree_id, &insert->k);
+ struct bpos end = insert->k.p;
struct btree_iter copy;
bch2_trans_copy_iter(&copy, iter);
@@ -111,42 +113,60 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
if (ret)
goto err;
+ copy.flags |= BTREE_ITER_nofilter_whiteouts;
+
+ /*
+ * We're doing our own whiteout filtering, but we still need to pass a
+ * max key to avoid popping an assert in bch2_snapshot_is_ancestor():
+ */
struct bkey_s_c k;
- for_each_btree_key_max_continue_norestart(copy, *end, 0, k, ret) {
+ unsigned nr_iters = 0;
+ for_each_btree_key_max_continue_norestart(copy,
+ POS(insert->k.p.inode, U64_MAX),
+ 0, k, ret) {
unsigned offset = 0;
if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
offset = iter->pos.offset - bkey_start_offset(k.k);
- ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
- if (ret)
- break;
+ if (bkey_extent_whiteout(k.k)) {
+ if (bpos_gt(k.k->p, insert->k.p)) {
+ if (k.k->type == KEY_TYPE_extent_whiteout)
+ break;
+ else
+ continue;
+ } else if (k.k->type != whiteout_type) {
+ nr_iters += 1;
+ if (nr_iters >= EXTENT_ITERS_MAX) {
+ end = bpos_min(end, k.k->p);
+ break;
+ }
+ }
+ } else {
+ if (bpos_ge(bkey_start_pos(k.k), end))
+ break;
+
+ ret = count_iters_for_insert(trans, k, offset, &end, &nr_iters);
+ if (ret)
+ break;
+ }
}
err:
bch2_trans_iter_exit(&copy);
- return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *k)
-{
- struct bpos end = k->k.p;
- int ret = bch2_extent_atomic_end(trans, iter, &end);
- if (ret)
+ if (ret < 0)
return ret;
/* tracepoint */
- if (bpos_lt(end, k->k.p)) {
+ if (bpos_lt(end, insert->k.p)) {
if (trace_extent_trim_atomic_enabled()) {
CLASS(printbuf, buf)();
bch2_bpos_to_text(&buf, end);
prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(insert));
trace_extent_trim_atomic(trans->c, buf.buf);
}
- bch2_cut_back(end, k);
+ bch2_cut_back(end, insert);
}
return 0;
}
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index 34467db53f45..2d956d971b11 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -4,8 +4,6 @@
#include "bcachefs.h"
-int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
- struct bpos *);
int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
struct bkey_i *);
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 0005569ecace..fd8beb5167ee 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -42,6 +42,14 @@ struct readpages_iter {
folios folios;
};
+static inline void readpages_iter_folio_revert(struct readahead_control *ractl,
+ struct folio *folio)
+{
+ bch2_folio_release(folio);
+ ractl->_nr_pages += folio_nr_pages(folio);
+ ractl->_index -= folio_nr_pages(folio);
+}
+
static int readpages_iter_init(struct readpages_iter *iter,
struct readahead_control *ractl)
{
@@ -52,9 +60,7 @@ static int readpages_iter_init(struct readpages_iter *iter,
while ((folio = __readahead_folio(ractl))) {
if (!bch2_folio_create(folio, GFP_KERNEL) ||
darray_push(&iter->folios, folio)) {
- bch2_folio_release(folio);
- ractl->_nr_pages += folio_nr_pages(folio);
- ractl->_index -= folio_nr_pages(folio);
+ readpages_iter_folio_revert(ractl, folio);
return iter->folios.nr ? 0 : -ENOMEM;
}
@@ -64,6 +70,15 @@ static int readpages_iter_init(struct readpages_iter *iter,
return 0;
}
+static void readpages_iter_exit(struct readpages_iter *iter,
+ struct readahead_control *ractl)
+{
+ darray_for_each_reverse(iter->folios, folio) {
+ readpages_iter_folio_revert(ractl, *folio);
+ folio_get(*folio);
+ }
+}
+
static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
{
if (iter->idx >= iter->folios.nr)
@@ -290,7 +305,10 @@ void bch2_readahead(struct readahead_control *ractl)
* scheduling.
*/
blk_start_plug(&plug);
- bch2_pagecache_add_get(inode);
+ if (!bch2_pagecache_add_tryget(inode)) {
+ readpages_iter_exit(&readpages_iter, ractl);
+ goto out;
+ }
struct btree_trans *trans = bch2_trans_get(c);
while ((folio = readpage_iter_peek(&readpages_iter))) {
@@ -317,6 +335,7 @@ void bch2_readahead(struct readahead_control *ractl)
bch2_trans_put(trans);
bch2_pagecache_add_put(inode);
+out:
blk_finish_plug(&plug);
darray_exit(&readpages_iter.folios);
}
@@ -759,7 +778,6 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
struct bch2_folio_reservation *res = fsdata;
unsigned offset = pos - folio_pos(folio);
- lockdep_assert_held(&inode->v.i_rwsem);
BUG_ON(offset + copied > folio_size(folio));
if (unlikely(copied < len && !folio_test_uptodate(folio))) {
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 52722a5e8526..0425238a83ee 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -857,9 +857,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
if (IS_ERR(inode))
return bch2_err_class(PTR_ERR(inode));
- inode_lock(&inode->v);
ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
- inode_unlock(&inode->v);
if (unlikely(ret))
goto err;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 01c1c6372229..ccc44b1fc178 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -266,7 +266,8 @@ create_lostfound:
root_inode.bi_nlink++;
- ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
+ ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu,
+ inode_opt_get(c, &root_inode, inodes_32bit));
if (ret)
goto err;
@@ -573,7 +574,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
new_inode.bi_subvol = subvolid;
- int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu, false) ?:
bch2_btree_iter_traverse(&inode_iter) ?:
bch2_inode_write(trans, &inode_iter, &new_inode);
bch2_trans_iter_exit(&inode_iter);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index d5e5190f0663..4aa130ff7cf6 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -944,11 +944,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
}
static struct bkey_i_inode_alloc_cursor *
-bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
+bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max,
+ bool is_32bit)
{
struct bch_fs *c = trans->c;
- u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
+ u64 cursor_idx = is_32bit ? 0 : cpu + 1;
cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
@@ -967,7 +968,7 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m
if (IS_ERR(cursor))
return cursor;
- if (c->opts.inodes_32bit) {
+ if (is_32bit) {
*min = BLOCKDEV_INODE_MAX;
*max = INT_MAX;
} else {
@@ -996,11 +997,11 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m
int bch2_inode_create(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
+ u32 snapshot, u64 cpu, bool is_32bit)
{
u64 min, max;
struct bkey_i_inode_alloc_cursor *cursor =
- bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
+ bch2_inode_alloc_cursor_get(trans, cpu, &min, &max, is_32bit);
int ret = PTR_ERR_OR_ZERO(cursor);
if (ret)
return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index b8ec3e628d90..79092ea74844 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -172,7 +172,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
struct bch_inode_unpacked *);
int bch2_inode_create(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, u32, u64);
+ struct bch_inode_unpacked *, u32, u64, bool);
int bch2_inode_rm(struct bch_fs *, subvol_inum);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 1f00938b1bdc..e07fa6cc99bd 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -144,7 +144,8 @@ enum inode_opt_id {
x(unlinked, 7) \
x(backptr_untrusted, 8) \
x(has_child_snapshot, 9) \
- x(has_case_insensitive, 10)
+ x(has_case_insensitive, 10) \
+ x(31bit_dirent_offset, 11)
/* bits 20+ reserved for packed fields below: */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 07869436a964..93ac0faedf7d 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -120,6 +120,7 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
+ p->bytes = 0;
}
/*
@@ -264,6 +265,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ struct journal_entry_pin_list *pin_list =
+ journal_seq_pin(j, journal_cur_seq(j));
+ pin_list->bytes = roundup_pow_of_two(vstruct_bytes(buf->data));
+ j->dirty_entry_bytes += pin_list->bytes;
+
if (trace_journal_entry_close_enabled() && trace) {
CLASS(printbuf, err)();
guard(printbuf_atomic)(&err);
@@ -737,9 +743,9 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
CLASS(printbuf, buf)();
+ prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
bch2_journal_debug_to_text(&buf, j);
bch2_print_str(c, KERN_ERR, buf.buf);
- prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
closure_wait_event(&j->async_wait,
!bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 093e4acad085..c5458c61f49a 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -3,6 +3,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
+#include "btree_journal_iter.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
@@ -106,11 +107,6 @@ static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *cs
return !bch2_crc_cmp(j->csum, *csum);
}
-static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
-{
- return (seq - c->journal_entries_base_seq) & (~0U >> 1);
-}
-
static void __journal_replay_free(struct bch_fs *c,
struct journal_replay *i)
{
@@ -195,6 +191,23 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
}
}
+ /* Drop overwrites, log entries if we don't need them: */
+ if (!c->opts.retain_recovery_info &&
+ !c->opts.journal_rewind) {
+ struct jset_entry *dst = j->start;
+ vstruct_for_each_safe(j, src) {
+ if (src->type == BCH_JSET_ENTRY_log ||
+ src->type == BCH_JSET_ENTRY_overwrite)
+ continue;
+
+ memcpy(dst, src, vstruct_bytes(src));
+ dst = vstruct_next(dst);
+ }
+
+ j->u64s = cpu_to_le32((u64 *) dst - j->_data);
+ bytes = vstruct_bytes(j);
+ }
+
jlist->last_seq = max(jlist->last_seq, last_seq);
_i = genradix_ptr_alloc(&c->journal_entries,
@@ -209,6 +222,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
*/
dup = *_i;
if (dup) {
+ BUG_ON(dup->j.seq != j->seq);
+
bool identical = bytes == vstruct_bytes(&dup->j) &&
!memcmp(j, &dup->j, bytes);
bool not_identical = !identical &&
@@ -239,6 +254,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
if (entry_ptr.csum_good && !identical)
goto replace;
+ BUG_ON(dup->j.seq != j->seq);
return ret;
}
replace:
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index f53c5c81d137..f8754bf71264 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -7,29 +7,6 @@
void bch2_journal_pos_from_member_info_set(struct bch_fs *);
void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
-struct journal_ptr {
- bool csum_good;
- struct bch_csum csum;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
-};
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
- DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
-
- bool csum_good;
- bool ignore_blacklisted;
- bool ignore_not_dirty;
- /* must be last: */
- struct jset j;
-};
-
static inline bool journal_replay_ignore(struct journal_replay *i)
{
return !i || i->ignore_blacklisted || i->ignore_not_dirty;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index f23e5ee9ad75..bd1885607d3e 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -148,6 +148,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+ size_t mem_limit = max_t(ssize_t, 0,
+ (totalram_pages() * PAGE_SIZE) / 4 - j->dirty_entry_bytes);
+
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr ||
!ca->mi.durability)
@@ -180,6 +183,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
* @nr_devs_want largest devices:
*/
space = dev_space[nr_devs_want - 1];
+ space.total = min(space.total, mem_limit >> 9);
space.next_entry = min(space.next_entry, min_bucket_size);
return space;
}
@@ -328,9 +332,17 @@ void bch2_journal_reclaim_fast(struct journal *j)
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
+ struct journal_entry_pin_list *pin_list;
while (!fifo_empty(&j->pin) &&
j->pin.front <= j->seq_ondisk &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ !atomic_read(&(pin_list = &fifo_peek_front(&j->pin))->count)) {
+
+ if (WARN_ON(j->dirty_entry_bytes < pin_list->bytes))
+ pin_list->bytes = j->dirty_entry_bytes;
+
+ j->dirty_entry_bytes -= pin_list->bytes;
+ pin_list->bytes = 0;
+
j->pin.front++;
popped = true;
}
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 51104bbb99da..7c9273bd0e15 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -71,6 +71,7 @@ struct journal_entry_pin_list {
struct list_head flushed[JOURNAL_PIN_TYPE_NR];
atomic_t count;
struct bch_devs_list devs;
+ size_t bytes;
};
struct journal;
@@ -253,6 +254,7 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
+ size_t dirty_entry_bytes;
struct journal_space space[journal_space_nr];
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index a66d01d04e57..892990b4a6a6 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -125,6 +125,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
if (!btree_type_has_ptrs(id))
continue;
+ /* Stripe keys have pointers, but are handled separately */
+ if (id == BTREE_ID_stripes)
+ continue;
+
int ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index df6833416855..4f41f1f6ec6c 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -819,7 +819,9 @@ static int bch2_move_data(struct bch_fs *c,
unsigned min_depth_this_btree = min_depth;
- if (!btree_type_has_ptrs(id))
+ /* Stripe keys have pointers, but are handled separately */
+ if (!btree_type_has_ptrs(id) ||
+ id == BTREE_ID_stripes)
min_depth_this_btree = max(min_depth_this_btree, 1);
for (unsigned level = min_depth_this_btree;
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index d1019052f182..5c321a0d1f89 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -62,7 +62,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (flags & BCH_CREATE_TMPFILE)
new_inode->bi_flags |= BCH_INODE_unlinked;
- ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu,
+ inode_opt_get(c, dir_u, inodes_32bit));
if (ret)
goto err;
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 921f9049912d..c3ef35dc01e2 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -525,7 +525,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id
switch (id) {
case Opt_state:
if (ca)
- return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
+ return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED, NULL);
break;
case Opt_compression:
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 84ce69a7f131..31a3abcbd83e 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -242,7 +242,7 @@ enum fsck_err_opts {
x(inodes_32bit, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- BCH_SB_INODE_32BIT, true, \
+ BCH_SB_INODE_32BIT, false, \
NULL, "Constrain inode numbers to 32 bits") \
x(shard_inode_numbers_bits, u8, \
OPT_FS|OPT_FORMAT, \
@@ -321,6 +321,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
+ x(no_version_check, u8, \
+ OPT_HIDDEN, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't fail reading the superblock due to incompatible version")\
x(verbose, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 17ca56b0e2ac..e1db63d75a99 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -444,8 +444,9 @@ static int do_rebalance_extent(struct moving_context *ctxt,
bch2_bkey_buf_init(&sk);
- ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &io_opts, &data_opts));
+ ret = lockrestart_do(trans,
+ bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &io_opts, &data_opts)));
if (ret || !k.k)
goto out;
@@ -587,7 +588,7 @@ static int do_rebalance(struct moving_context *ctxt)
ret = k->k.type == KEY_TYPE_cookie
? do_rebalance_scan(ctxt, k->k.p.inode,
le64_to_cpu(bkey_i_to_cookie(k)->v.cookie))
- : lockrestart_do(trans, do_rebalance_extent(ctxt, k->k.p, &extent_iter));
+ : do_rebalance_extent(ctxt, k->k.p, &extent_iter);
if (ret)
break;
}
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 29e81f96db0f..6319144a440c 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -64,7 +64,6 @@ int bch2_btree_lost_data(struct bch_fs *c,
* but in debug mode we want the next fsck run to be clean:
*/
ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret;
#endif
write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
@@ -182,9 +181,12 @@ void bch2_reconstruct_alloc(struct bch_fs *c)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
- darray_for_each(*keys, i)
- if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
+ darray_for_each(*keys, i) {
+ struct bkey_i *k = journal_key_k(c, i);
+ if (k->k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(k)->v.mem_ptr = 0;
+ }
}
/* journal replay: */
@@ -202,8 +204,10 @@ static void replay_now_at(struct journal *j, u64 seq)
static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct journal_key *k)
{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *bk = journal_key_k(c, k);
struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
int ret = bch2_btree_iter_traverse(&iter);
@@ -214,14 +218,14 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
/* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
+ if (bversion_cmp(old.k->bversion, bk->k.bversion) >= 0) {
ret = 0;
goto out;
}
- struct bkey_i *new = k->k;
+ struct bkey_i *new = bk;
if (old.k->type == KEY_TYPE_accounting) {
- new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
+ new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(bk));
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto out;
@@ -230,7 +234,8 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
bkey_s_c_to_accounting(old));
}
- trans->journal_res.seq = k->journal_seq;
+ if (!k->allocated)
+ trans->journal_res.seq = c->journal_entries_base_seq + k->journal_seq_offset;
ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
out:
@@ -241,6 +246,7 @@ out:
static int bch2_journal_replay_key(struct btree_trans *trans,
struct journal_key *k)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
unsigned iter_flags =
BTREE_ITER_intent|
@@ -251,7 +257,8 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
return 0;
- trans->journal_res.seq = k->journal_seq;
+ if (!k->allocated)
+ trans->journal_res.seq = c->journal_entries_base_seq + k->journal_seq_offset;
/*
* BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
@@ -266,7 +273,8 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
else
update_flags |= BTREE_UPDATE_key_cache_reclaim;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ struct bkey_i *bk = journal_key_k(c, k);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
ret = bch2_btree_iter_traverse(&iter);
@@ -275,13 +283,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
struct btree_path *path = btree_iter_path(trans, &iter);
if (unlikely(!btree_path_node(path, k->level))) {
- struct bch_fs *c = trans->c;
-
CLASS(printbuf, buf)();
prt_str(&buf, "btree=");
bch2_btree_id_to_text(&buf, k->btree_id);
prt_printf(&buf, " level=%u ", k->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(bk));
if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
@@ -298,7 +304,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
}
bch2_trans_iter_exit(&iter);
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
@@ -310,17 +316,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
goto out;
- if (k->k->k.type == KEY_TYPE_accounting) {
- struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
+ if (bk->k.type == KEY_TYPE_accounting) {
+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, bk->k.u64s);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto out;
- bkey_copy(n, k->k);
+ bkey_copy(n, bk);
goto out;
}
- ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+ ret = bch2_trans_update(trans, &iter, bk, update_flags);
out:
bch2_trans_iter_exit(&iter);
return ret;
@@ -331,13 +337,9 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r);
- /*
- * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last
- *
- * journal_seq == 0 means that the key comes from early repair, and
- * should be inserted last so as to avoid overflowing the journal
- */
- return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
+ return !l->allocated && !r->allocated
+ ? cmp_int(l->journal_seq_offset, r->journal_seq_offset)
+ : cmp_int(l->allocated, r->allocated);
}
DEFINE_DARRAY_NAMED(darray_journal_keys, struct journal_key *)
@@ -369,7 +371,9 @@ int bch2_journal_replay(struct bch_fs *c)
* flush accounting keys until we're done
*/
darray_for_each(*keys, k) {
- if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
+ struct bkey_i *bk = journal_key_k(trans->c, k);
+
+ if (!(bk->k.type == KEY_TYPE_accounting && !k->allocated))
continue;
cond_resched();
@@ -412,7 +416,6 @@ int bch2_journal_replay(struct bch_fs *c)
BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
@@ -434,8 +437,8 @@ int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = *kp;
- if (k->journal_seq)
- replay_now_at(j, k->journal_seq);
+ if (!k->allocated)
+ replay_now_at(j, c->journal_entries_base_seq + k->journal_seq_offset);
else
replay_now_at(j, j->replay_journal_seq_end);
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
index b63c20558d3d..2696eee00345 100644
--- a/fs/bcachefs/recovery_passes_format.h
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -37,7 +37,7 @@
x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE) \
x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 0784283ce78c..3ffd68d2608d 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -784,7 +784,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
/* Query replicas: */
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
- unsigned flags, bool print)
+ unsigned flags, struct printbuf *err)
{
struct bch_replicas_entry_v1 *e;
@@ -823,16 +823,14 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
: BCH_FORCE_IF_DATA_DEGRADED;
if (dflags & ~flags) {
- if (print) {
- CLASS(printbuf, buf)();
-
- bch2_replicas_entry_to_text(&buf, e);
- bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf.buf);
+ if (err) {
+ prt_printf(err, "insufficient devices online (%u) for replicas entry ",
+ nr_online);
+ bch2_replicas_entry_to_text(err, e);
+ prt_newline(err);
}
return false;
}
-
}
return true;
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 5aba2c1ce133..15023a9b0b1e 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -44,7 +44,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
}
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
- unsigned, bool);
+ unsigned, struct printbuf *);
unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 44bc12573a0c..96ad64920810 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -22,7 +22,7 @@ enum counters_flags {
x(io_read_split, 33, TYPE_COUNTER) \
x(io_read_reuse_race, 34, TYPE_COUNTER) \
x(io_read_retry, 32, TYPE_COUNTER) \
- x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
+ x(io_read_fail_and_poison, 95, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(io_move_read, 35, TYPE_SECTORS) \
@@ -124,4 +124,13 @@ struct bch_sb_field_counters {
__le64 d[];
};
+static inline void __maybe_unused check_bch_counter_ids_unique(void) {
+ switch(0){
+#define x(t, n, ...) case (n):
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ ;
+ }
+}
+
#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index e3c73d903898..d26a0ca4a59d 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -36,10 +36,12 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
{
- if (dev != BCH_SB_MEMBER_INVALID)
+ if (dev != BCH_SB_MEMBER_INVALID) {
bch2_fs_inconsistent(c, "pointer to %s device %u",
test_bit(dev, c->devs_removed.d)
? "removed" : "nonexistent", dev);
+ dump_stack();
+ }
}
void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 84f987d3a02a..eab0c1e3ff56 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1673,7 +1673,8 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return ret;
darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
+ nr_deleted_ancestors += bch2_snapshots_same_tree(c, s->k.p.offset, i->id) &&
+ bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
if (!nr_deleted_ancestors)
return 0;
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index fef32a0118c4..28d9a29a1fd0 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -51,6 +51,17 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
return s ? s->tree : 0;
}
+static inline bool bch2_snapshots_same_tree(struct bch_fs *c, u32 id1, u32 id2)
+{
+ if (id1 == id2)
+ return true;
+
+ guard(rcu)();
+ const struct snapshot_t *s1 = snapshot_t(c, id1);
+ const struct snapshot_t *s2 = snapshot_t(c, id2);
+ return s1 && s2 && s1->tree == s2->tree;
+}
+
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
const struct snapshot_t *s = snapshot_t(c, id);
@@ -157,6 +168,10 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
+ EBUG_ON(!id);
+ EBUG_ON(!ancestor);
+ EBUG_ON(!bch2_snapshots_same_tree(c, id, ancestor));
+
return id == ancestor
? true
: __bch2_snapshot_is_ancestor(c, id, ancestor);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 8c0fb44929cc..2a61cc36ddbf 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -34,6 +34,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
struct bch_hash_info {
u32 inum_snapshot;
u8 type;
+ bool is_31bit;
struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
@@ -48,6 +49,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
struct bch_hash_info info = {
.inum_snapshot = bi->bi_snapshot,
.type = INODE_STR_HASH(bi),
+ .is_31bit = bi->bi_flags & BCH_INODE_31bit_dirent_offset,
.cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
.siphash_key = { .k0 = bi->bi_hash_seed }
};
@@ -112,8 +114,8 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
}
}
-static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
+static inline u64 __bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info)
{
switch (info->type) {
case BCH_STR_HASH_crc32c:
@@ -128,6 +130,14 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
}
}
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info,
+ bool maybe_31bit)
+{
+ return __bch2_str_hash_end(ctx, info) &
+ (maybe_31bit && info->is_31bit ? INT_MAX : U64_MAX);
+}
+
struct bch_hash_desc {
enum btree_id btree_id;
u8 key_type;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index be7ed612d28f..61eeac671283 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -89,7 +89,7 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
prt_str(&buf, "requested incompat feature ");
bch2_version_to_text(&buf, version);
prt_str(&buf, " currently not enabled, allowed up to ");
- bch2_version_to_text(&buf, version);
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
prt_printf(&buf, "\n set version_upgrade=incompat to enable");
bch_notice(c, "%s", buf.buf);
@@ -379,7 +379,7 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+int bch2_sb_validate(struct bch_sb *sb, struct bch_opts *opts, u64 read_offset,
enum bch_validate_flags flags, struct printbuf *out)
{
enum bch_opt_id opt_id;
@@ -389,28 +389,30 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
if (ret)
return ret;
- u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
- unsigned incompat_bit = 0;
- if (incompat)
- incompat_bit = __ffs64(incompat);
- else if (sb->features[1])
- incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
- if (incompat_bit) {
- prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
- incompat_bit,
- bch2_sb_features[BCH_FEATURE_NR - 1],
- BCH_FEATURE_NR - 1);
- return -BCH_ERR_invalid_sb_features;
- }
+ if (!opts->no_version_check) {
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+ unsigned incompat_bit = 0;
+ if (incompat)
+ incompat_bit = __ffs64(incompat);
+ else if (sb->features[1])
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+ if (incompat_bit) {
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+ incompat_bit,
+ bch2_sb_features[BCH_FEATURE_NR - 1],
+ BCH_FEATURE_NR - 1);
+ return -BCH_ERR_invalid_sb_features;
+ }
- if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
- BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_str(out, "Filesystem has incompatible version ");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_str(out, ", current version ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- return -BCH_ERR_invalid_sb_features;
+ if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
+ BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
+ prt_str(out, "Filesystem has incompatible version ");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_str(out, ", current version ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ return -BCH_ERR_invalid_sb_features;
+ }
}
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
@@ -915,7 +917,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb->sb, offset, 0, &err);
+ ret = bch2_sb_validate(sb->sb, opts, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -1081,9 +1083,10 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_from_fs(c, (*ca));
darray_for_each(online_devices, ca) {
+ struct bch_opts opts = bch2_opts_empty();
printbuf_reset(&err);
- ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, &opts, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
@@ -1186,13 +1189,13 @@ int bch2_write_super(struct bch_fs *c)
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+ bch2_have_enough_devs(c, sb_written, degraded_flags, NULL);
for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+ bch2_have_enough_devs(c, sb_written, degraded_flags, NULL);
/*
* If we would be able to mount _without_ the devices we successfully
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index a3b7a90f2533..82cb3a3ceeae 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -92,7 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+int bch2_sb_validate(struct bch_sb *, struct bch_opts *, u64,
+ enum bch_validate_flags, struct printbuf *);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 70d5aff38723..cc9d00e1afd5 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1021,6 +1021,12 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
prt_bitflags(&p, bch2_recovery_passes, sb_passes);
}
+ u64 btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ if (btrees_lost_data) {
+ prt_str(&p, "\nsuperblock indicates damage to following btrees:\n ");
+ prt_bitflags(&p, __bch2_btree_ids, btrees_lost_data);
+ }
+
if (bch2_check_version_downgrade(c)) {
prt_str(&p, "\nVersion downgrade required:");
@@ -1362,10 +1368,14 @@ static bool bch2_fs_may_start(struct bch_fs *c)
return false;
}
break;
- }
+ }
}
- return bch2_have_enough_devs(c, c->online_devs, flags, true);
+ CLASS(printbuf, err)();
+ bool ret = bch2_have_enough_devs(c, c->online_devs, flags, &err);
+ if (!ret)
+ bch2_print_str(c, KERN_ERR, err.buf);
+ return ret;
}
int bch2_fs_start(struct bch_fs *c)
@@ -1557,7 +1567,6 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
- genradix_free(&ca->buckets_gc);
bch2_free_super(&ca->disk_sb);
bch2_dev_allocator_background_exit(ca);
@@ -1741,19 +1750,20 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return 0;
}
-static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb,
+ struct printbuf *err)
{
unsigned ret;
if (bch2_dev_is_online(ca)) {
- bch_err(ca, "already have device online in slot %u",
- sb->sb->dev_idx);
+ prt_printf(err, "already have device online in slot %u\n",
+ sb->sb->dev_idx);
return bch_err_throw(ca->fs, device_already_online);
}
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(ca, "cannot online: device too small (capacity %llu filesystem size %llu nbuckets %llu)",
+ prt_printf(err, "cannot online: device too small (capacity %llu filesystem size %llu nbuckets %llu)\n",
get_capacity(sb->bdev->bd_disk),
ca->mi.bucket_size * ca->mi.nbuckets,
ca->mi.nbuckets);
@@ -1789,7 +1799,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
return 0;
}
-static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb,
+ struct printbuf *err)
{
struct bch_dev *ca;
int ret;
@@ -1804,7 +1815,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
ca = bch2_dev_locked(c, sb->sb->dev_idx);
- ret = __bch2_dev_attach_bdev(ca, sb);
+ ret = __bch2_dev_attach_bdev(ca, sb, err);
if (ret)
return ret;
@@ -1828,7 +1839,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
* because we got an error or what have you?
*/
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
+ enum bch_member_state new_state, int flags,
+ struct printbuf *err)
{
struct bch_devs_mask new_online_devs;
int nr_rw = 0, required;
@@ -1865,7 +1877,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
new_online_devs = c->online_devs;
__clear_bit(ca->dev_idx, new_online_devs.d);
- return bch2_have_enough_devs(c, new_online_devs, flags, false);
+ return bch2_have_enough_devs(c, new_online_devs, flags, err);
default:
BUG();
}
@@ -1899,14 +1911,15 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
+ enum bch_member_state new_state, int flags,
+ struct printbuf *err)
{
int ret = 0;
if (ca->mi.state == new_state)
return 0;
- if (!bch2_dev_state_allowed(c, ca, new_state, flags))
+ if (!bch2_dev_state_allowed(c, ca, new_state, flags, err))
return bch_err_throw(c, device_state_not_allowed);
if (new_state != BCH_MEMBER_STATE_rw)
@@ -1929,15 +1942,17 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
}
int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
+ enum bch_member_state new_state, int flags,
+ struct printbuf *err)
{
guard(rwsem_write)(&c->state_lock);
- return __bch2_dev_set_state(c, ca, new_state, flags);
+ return __bch2_dev_set_state(c, ca, new_state, flags, err);
}
/* Device add/removal: */
-int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags,
+ struct printbuf *err)
{
unsigned dev_idx = ca->dev_idx, data;
bool fast_device_removal = !bch2_request_incompat_feature(c,
@@ -1952,8 +1967,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*/
bch2_dev_put(ca);
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot remove without losing data");
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags, NULL)) {
+ prt_printf(err, "Cannot remove without losing data\n");
ret = bch_err_throw(c, device_state_not_allowed);
goto err;
}
@@ -1973,16 +1988,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!data_type_is_empty(i) &&
!data_type_is_hidden(i) &&
usage.buckets[i]) {
- bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
- __bch2_data_types[i], usage.buckets[i]);
+ prt_printf(err, "Remove failed: still has data (%s, %llu buckets)\n",
+ __bch2_data_types[i], usage.buckets[i]);
ret = -EBUSY;
goto err;
}
ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_remove_alloc() error: %s\n", bch2_err_str(ret));
goto err;
+ }
/*
* We need to flush the entire journal to get rid of keys that reference
@@ -1995,25 +2011,28 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* calls, and could be cleaned up:
*/
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_journal_flush_device_pins() error: %s\n", bch2_err_str(ret));
goto err;
+ }
ret = bch2_journal_flush(&c->journal);
- bch_err_msg(ca, ret, "bch2_journal_flush()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_journal_flush() error: %s\n", bch2_err_str(ret));
goto err;
+ }
ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "bch2_replicas_gc2()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_replicas_gc2() error: %s\n", bch2_err_str(ret));
goto err;
+ }
data = bch2_dev_has_data(c, ca);
if (data) {
- CLASS(printbuf, data_has)();
- prt_bitflags(&data_has, __bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ prt_str(err, "Remove failed, still has data (");
+ prt_bitflags(err, __bch2_data_types, data);
+ prt_str(err, ")\n");
ret = -EBUSY;
goto err;
}
@@ -2058,7 +2077,7 @@ err:
}
/* Add new device to running filesystem: */
-int bch2_dev_add(struct bch_fs *c, const char *path)
+int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = {};
@@ -2067,9 +2086,10 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
int ret = 0;
ret = bch2_read_super(path, &opts, &sb);
- bch_err_msg(c, ret, "reading super");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error reading superblock: %s\n", bch2_err_str(ret));
goto err;
+ }
struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
@@ -2090,7 +2110,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
}
if (ret) {
- bch_err(c, "filesystem UUID already open");
+ prt_printf(err, "cannot go multidevice: filesystem UUID already open\n");
goto err;
}
}
@@ -2105,7 +2125,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
}
- ret = __bch2_dev_attach_bdev(ca, &sb);
+ ret = __bch2_dev_attach_bdev(ca, &sb, err);
if (ret)
goto err;
@@ -2114,16 +2134,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
ret = bch2_sb_from_fs(c, ca);
- bch_err_msg(c, ret, "setting up new superblock");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error setting up new superblock: %s\n", bch2_err_str(ret));
goto err;
+ }
if (dynamic_fault("bcachefs:add:no_slot"))
goto err;
ret = bch2_sb_member_alloc(c);
if (ret < 0) {
- bch_err_msg(c, ret, "setting up new superblock");
+ prt_printf(err, "error allocating superblock member slot: %s\n", bch2_err_str(ret));
goto err;
}
unsigned dev_idx = ret;
@@ -2141,7 +2162,8 @@
if (BCH_MEMBER_GROUP(&dev_mi)) {
ret = __bch2_dev_group_set(c, ca, label.buf);
- bch_err_msg(c, ret, "creating new label");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error creating new label: %s\n", bch2_err_str(ret));
goto err_late;
+ }
}
@@ -2155,22 +2176,25 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (test_bit(BCH_FS_started, &c->flags)) {
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(ca, ret, "marking new superblock");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error marking new superblock: %s\n", bch2_err_str(ret));
goto err_late;
+ }
ret = bch2_fs_freespace_init(c);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error initializing free space: %s\n", bch2_err_str(ret));
goto err_late;
+ }
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error allocating journal: %s\n", bch2_err_str(ret));
goto err_late;
+ }
}
/*
@@ -2203,7 +2227,7 @@ err_late:
}
/* Hot add existing device to running filesystem: */
-int bch2_dev_online(struct bch_fs *c, const char *path)
+int bch2_dev_online(struct bch_fs *c, const char *path, struct printbuf *err)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
@@ -2214,42 +2238,48 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
guard(rwsem_write)(&c->state_lock);
ret = bch2_read_super(path, &opts, &sb);
- if (ret)
+ if (ret) {
+ prt_printf(err, "error reading superblock: %s\n", bch2_err_str(ret));
return ret;
+ }
dev_idx = sb.sb->dev_idx;
ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
- bch_err_msg(c, ret, "bringing %s online", path);
- if (ret)
+ if (ret) {
+ prt_printf(err, "device not a member of fs: %s\n", bch2_err_str(ret));
goto err;
+ }
- ret = bch2_dev_attach_bdev(c, &sb);
+ ret = bch2_dev_attach_bdev(c, &sb, err);
if (ret)
goto err;
ca = bch2_dev_locked(c, dev_idx);
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_trans_mark_dev_sb() error: %s\n", bch2_err_str(ret));
goto err;
+ }
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
if (!ca->mi.freespace_initialized) {
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_freespace_init() error: %s\n", bch2_err_str(ret));
goto err;
+ }
}
if (!ca->journal.nr) {
ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(ca, ret, "allocating journal");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_journal_alloc() error: %s\n", bch2_err_str(ret));
goto err;
+ }
}
scoped_guard(mutex, &c->sb_lock) {
@@ -2264,17 +2294,17 @@ err:
return ret;
}
-int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags, struct printbuf *err)
{
guard(rwsem_write)(&c->state_lock);
if (!bch2_dev_is_online(ca)) {
- bch_err(ca, "Already offline");
+ prt_printf(err, "Already offline\n");
return 0;
}
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot offline required disk");
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags, NULL)) {
+ prt_printf(err, "Cannot offline required disk\n");
return bch_err_throw(c, device_state_not_allowed);
}
@@ -2294,7 +2324,7 @@ static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new
bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
}
-int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct printbuf *err)
{
u64 old_nbuckets;
int ret = 0;
@@ -2303,31 +2333,36 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
old_nbuckets = ca->mi.nbuckets;
if (nbuckets < ca->mi.nbuckets) {
- bch_err(ca, "Cannot shrink yet");
+ prt_printf(err, "Cannot shrink yet\n");
return -EINVAL;
}
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
- bch_err(ca, "New device size too big (%llu greater than max %u)",
- nbuckets, BCH_MEMBER_NBUCKETS_MAX);
+ prt_printf(err, "New device size too big (%llu greater than max %u)\n",
+ nbuckets, BCH_MEMBER_NBUCKETS_MAX);
return bch_err_throw(c, device_size_too_big);
}
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
- bch_err(ca, "New size larger than device");
+ prt_printf(err, "New size %llu larger than device size %llu\n",
+ ca->mi.bucket_size * nbuckets,
+ get_capacity(ca->disk_sb.bdev->bd_disk));
return bch_err_throw(c, device_size_too_small);
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- bch_err_msg(ca, ret, "resizing buckets");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_buckets_resize() error: %s\n", bch2_err_str(ret));
return ret;
+ }
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_trans_mark_dev_sb() error: %s\n", bch2_err_str(ret));
return ret;
+ }
scoped_guard(mutex, &c->sb_lock) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
@@ -2338,8 +2373,10 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (ca->mi.freespace_initialized) {
ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
- if (ret)
+ if (ret) {
+ prt_printf(err, "__bch2_dev_resize_alloc() error: %s\n", bch2_err_str(ret));
return ret;
+ }
}
bch2_recalc_capacity(c);
@@ -2450,10 +2487,14 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
if (ca) {
+ CLASS(printbuf, buf)();
+ __bch2_log_msg_start(ca->name, &buf);
+ prt_printf(&buf, "offline from block layer\n");
+
bool dev = bch2_dev_state_allowed(c, ca,
BCH_MEMBER_STATE_failed,
- BCH_FORCE_IF_DEGRADED);
-
+ BCH_FORCE_IF_DEGRADED,
+ &buf);
if (!dev && sb) {
if (!surprise)
sync_filesystem(sb);
@@ -2461,11 +2502,6 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
evict_inodes(sb);
}
- CLASS(printbuf, buf)();
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "offline from block layer");
-
if (dev) {
__bch2_dev_offline(c, ca);
} else {
@@ -2543,11 +2579,6 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices,
BUG_ON(darray_push(&sbs, sb));
}
- if (opts->nochanges && !opts->read_only) {
- ret = bch_err_throw(c, erofs_nochanges);
- goto err_print;
- }
-
darray_for_each(sbs, sb)
if (!best || sb_cmp(sb->sb, best->sb) > 0)
best = sb;
@@ -2575,9 +2606,12 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices,
scoped_guard(rwsem_write, &c->state_lock)
darray_for_each(sbs, sb) {
- ret = bch2_dev_attach_bdev(c, sb);
- if (ret)
+ CLASS(printbuf, err)();
+ ret = bch2_dev_attach_bdev(c, sb, &err);
+ if (ret) {
+ bch_err(bch2_dev_locked(c, sb->sb->dev_idx), "%s", err.buf);
goto err;
+ }
}
if (!c->opts.nostart) {
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index e90bab9afe78..d13dbf2b8227 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -17,18 +17,20 @@ struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
+ enum bch_member_state, int,
+ struct printbuf *);
int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
+ enum bch_member_state, int,
+ struct printbuf *);
int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-
-int bch2_dev_fail(struct bch_dev *, int);
-int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_add(struct bch_fs *, const char *);
-int bch2_dev_online(struct bch_fs *, const char *);
-int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+ enum bch_member_state, int,
+ struct printbuf *);
+
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int, struct printbuf *);
+int bch2_dev_add(struct bch_fs *, const char *, struct printbuf *);
+int bch2_dev_online(struct bch_fs *, const char *, struct printbuf *);
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int, struct printbuf *);
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64, struct printbuf *);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 6094b568dd33..6d7303008b19 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -4,6 +4,7 @@
#include "acl.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "dirent.h"
#include "extents.h"
#include "fs.h"
#include "rebalance.h"
@@ -25,7 +26,7 @@ static u64 bch2_xattr_hash(const struct bch_hash_info *info,
bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
- return bch2_str_hash_end(&ctx, info);
+ return bch2_str_hash_end(&ctx, info, false);
}
static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
@@ -484,6 +485,22 @@ static int inode_opt_set_fn(struct btree_trans *trans,
return ret;
}
+ if (s->id == Inode_opt_inodes_32bit &&
+ !bch2_request_incompat_feature(trans->c, bcachefs_metadata_version_31bit_dirent_offset)) {
+ /*
+ * Make sure the dir is empty, as otherwise we'd need to
+ * rehash everything and update the dirent keys.
+ */
+ int ret = bch2_empty_dir_trans(trans, inode_inum(inode));
+ if (ret < 0)
+ return ret;
+
+ if (s->defined)
+ bi->bi_flags |= BCH_INODE_31bit_dirent_offset;
+ else
+ bi->bi_flags &= ~BCH_INODE_31bit_dirent_offset;
+ }
+
if (s->defined)
bi->bi_fields_set |= 1U << s->id;
else