Diffstat (limited to 'fs')
-rw-r--r-- fs/bcachefs/backpointers.c | 47
-rw-r--r-- fs/bcachefs/bcachefs.h | 17
-rw-r--r-- fs/bcachefs/bcachefs_format.h | 46
-rw-r--r-- fs/bcachefs/bcachefs_ioctl.h | 95
-rw-r--r-- fs/bcachefs/bkey_buf.h | 44
-rw-r--r-- fs/bcachefs/bkey_methods.c | 6
-rw-r--r-- fs/bcachefs/bkey_types.h | 5
-rw-r--r-- fs/bcachefs/btree_cache.c | 4
-rw-r--r-- fs/bcachefs/btree_gc.c | 93
-rw-r--r-- fs/bcachefs/btree_io.c | 7
-rw-r--r-- fs/bcachefs/btree_iter.c | 114
-rw-r--r-- fs/bcachefs/btree_iter.h | 6
-rw-r--r-- fs/bcachefs/btree_journal_iter.c | 209
-rw-r--r-- fs/bcachefs/btree_journal_iter.h | 35
-rw-r--r-- fs/bcachefs/btree_journal_iter_types.h | 38
-rw-r--r-- fs/bcachefs/btree_node_scan.c | 4
-rw-r--r-- fs/bcachefs/btree_trans_commit.c | 4
-rw-r--r-- fs/bcachefs/btree_update.c | 53
-rw-r--r-- fs/bcachefs/btree_update.h | 17
-rw-r--r-- fs/bcachefs/btree_update_interior.c | 16
-rw-r--r-- fs/bcachefs/btree_write_buffer.c | 10
-rw-r--r-- fs/bcachefs/btree_write_buffer.h | 2
-rw-r--r-- fs/bcachefs/buckets.c | 30
-rw-r--r-- fs/bcachefs/chardev.c | 215
-rw-r--r-- fs/bcachefs/darray.c | 23
-rw-r--r-- fs/bcachefs/darray.h | 19
-rw-r--r-- fs/bcachefs/data_update.c | 19
-rw-r--r-- fs/bcachefs/data_update.h | 1
-rw-r--r-- fs/bcachefs/dirent.c | 2
-rw-r--r-- fs/bcachefs/disk_accounting.c | 181
-rw-r--r-- fs/bcachefs/disk_accounting.h | 10
-rw-r--r-- fs/bcachefs/error.c | 12
-rw-r--r-- fs/bcachefs/error.h | 3
-rw-r--r-- fs/bcachefs/extent_update.c | 62
-rw-r--r-- fs/bcachefs/extent_update.h | 2
-rw-r--r-- fs/bcachefs/extents.c | 29
-rw-r--r-- fs/bcachefs/extents.h | 1
-rw-r--r-- fs/bcachefs/fs-io-buffered.c | 41
-rw-r--r-- fs/bcachefs/fs.c | 12
-rw-r--r-- fs/bcachefs/fsck.c | 9
-rw-r--r-- fs/bcachefs/inode.c | 11
-rw-r--r-- fs/bcachefs/inode.h | 2
-rw-r--r-- fs/bcachefs/inode_format.h | 3
-rw-r--r-- fs/bcachefs/journal.c | 8
-rw-r--r-- fs/bcachefs/journal_io.c | 47
-rw-r--r-- fs/bcachefs/journal_io.h | 23
-rw-r--r-- fs/bcachefs/journal_reclaim.c | 14
-rw-r--r-- fs/bcachefs/journal_types.h | 2
-rw-r--r-- fs/bcachefs/lru.h | 10
-rw-r--r-- fs/bcachefs/migrate.c | 4
-rw-r--r-- fs/bcachefs/move.c | 28
-rw-r--r-- fs/bcachefs/movinggc.c | 188
-rw-r--r-- fs/bcachefs/namei.c | 3
-rw-r--r-- fs/bcachefs/opts.c | 2
-rw-r--r-- fs/bcachefs/opts.h | 7
-rw-r--r-- fs/bcachefs/rebalance.c | 117
-rw-r--r-- fs/bcachefs/recovery.c | 258
-rw-r--r-- fs/bcachefs/recovery_passes_format.h | 2
-rw-r--r-- fs/bcachefs/replicas.c | 14
-rw-r--r-- fs/bcachefs/replicas.h | 2
-rw-r--r-- fs/bcachefs/sb-counters_format.h | 18
-rw-r--r-- fs/bcachefs/sb-errors_format.h | 3
-rw-r--r-- fs/bcachefs/sb-members.c | 4
-rw-r--r-- fs/bcachefs/snapshot.c | 3
-rw-r--r-- fs/bcachefs/snapshot.h | 15
-rw-r--r-- fs/bcachefs/str_hash.h | 14
-rw-r--r-- fs/bcachefs/super-io.c | 57
-rw-r--r-- fs/bcachefs/super-io.h | 3
-rw-r--r-- fs/bcachefs/super.c | 531
-rw-r--r-- fs/bcachefs/super.h | 22
-rw-r--r-- fs/bcachefs/trace.h | 5
-rw-r--r-- fs/bcachefs/vstructs.h | 3
-rw-r--r-- fs/bcachefs/xattr.c | 19
73 files changed, 1963 insertions(+), 1022 deletions(-)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 45d3db41225a..cb25cddb759b 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -532,10 +532,6 @@ static int check_bp_exists(struct btree_trans *trans,
struct btree_iter other_extent_iter = {};
CLASS(printbuf, buf)();
- if (bpos_lt(bp->k.p, s->bp_start) ||
- bpos_gt(bp->k.p, s->bp_end))
- return 0;
-
CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, 0);
struct bkey_s_c bp_k = bch2_btree_iter_peek_slot(&bp_iter);
int ret = bkey_err(bp_k);
@@ -690,6 +686,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct bkey_i_backpointer bp;
bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+ if (bpos_lt(bp.k.p, s->bp_start) ||
+ bpos_gt(bp.k.p, s->bp_end))
+ continue;
+
int ret = !empty
? check_bp_exists(trans, s, &bp, k)
: bch2_bucket_backpointer_mod(trans, k, &bp, true);
@@ -897,7 +897,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen &&
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointer_bucket_gen &&
(bp.v->bucket_gen != a->gen ||
bp.v->pad)) {
ret = bch2_backpointer_del(trans, bp_k.k->p);
@@ -929,6 +929,14 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
if (sectors[ALLOC_dirty] != a->dirty_sectors ||
sectors[ALLOC_cached] != a->cached_sectors ||
sectors[ALLOC_stripe] != a->stripe_sectors) {
+ /*
+ * Post 1.14 upgrade, we assume that backpointers are mostly
+ * correct and a sector count mismatch is probably due to a
+ * write buffer race
+ *
+ * Pre-upgrade, we expect all the buckets to be wrong, so a write
+ * buffer flush is pointless:
+ */
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
if (ret)
@@ -976,12 +984,22 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
goto next;
struct bpos bucket = bp_pos_to_bucket(ca, pos);
- u64 next = ca->mi.nbuckets;
-
- unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
- if (bitmap)
- next = min_t(u64, next,
- find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
+ u64 next = min(bucket.offset, ca->mi.nbuckets);
+
+ unsigned long *mismatch = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
+ unsigned long *empty = READ_ONCE(ca->bucket_backpointer_empty.buckets);
+ /*
+ * Find the first bucket with mismatches - but
+ * not empty buckets; we don't need to pin those
+ * because we just recreate all backpointers in
+ * those buckets
+ */
+ if (mismatch && empty)
+ next = find_next_andnot_bit(mismatch, empty, ca->mi.nbuckets, next);
+ else if (mismatch)
+ next = find_next_bit(mismatch, ca->mi.nbuckets, next);
+ else
+ next = ca->mi.nbuckets;
bucket.offset = next;
if (bucket.offset == ca->mi.nbuckets)
@@ -1108,17 +1126,18 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
if (ret)
goto err;
- u64 nr_buckets = 0, nr_mismatches = 0;
+ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
for_each_member_device(c, ca) {
nr_buckets += ca->mi.nbuckets;
nr_mismatches += ca->bucket_backpointer_mismatch.nr;
+ nr_empty += ca->bucket_backpointer_empty.nr;
}
if (!nr_mismatches)
goto err;
- bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches, nr_buckets);
+ bch_info(c, "scanning for missing backpointers in %llu/%llu buckets, %llu buckets with no backpointers",
+ nr_mismatches - nr_empty, nr_buckets, nr_empty);
while (1) {
ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
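The scan above pairs the per-device mismatch and empty bitmaps via find_next_andnot_bit() from <linux/find.h>: the next bucket to pin is the first one at or past the current position whose bit is set in mismatch but clear in empty. A minimal userspace model of that helper's semantics, for reference only (not the kernel implementation):

#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* First index >= @offset set in @mismatch and clear in @empty;
 * returns @size if no such bit exists. */
static unsigned long next_mismatch_not_empty(const unsigned long *mismatch,
					     const unsigned long *empty,
					     unsigned long size,
					     unsigned long offset)
{
	for (unsigned long i = offset; i < size; i++)
		if ((mismatch[i / BITS_PER_LONG] & ~empty[i / BITS_PER_LONG]) &
		    (1UL << (i % BITS_PER_LONG)))
			return i;
	return size;
}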
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index cdf593c59922..16d08dfb5f19 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -386,14 +386,6 @@ do { \
##__VA_ARGS__, bch2_err_str(_ret)); \
} while (0)
-static inline int __bch2_err_trace(struct bch_fs *c, int err)
-{
- trace_error_throw(c, err, _THIS_IP_);
- return err;
-}
-
-#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
-
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
@@ -1153,6 +1145,15 @@ struct bch_fs {
struct mutex fsck_error_counts_lock;
};
+static inline int __bch2_err_trace(struct bch_fs *c, int err)
+{
+ this_cpu_inc(c->counters[BCH_COUNTER_error_throw]);
+ trace_error_throw(c, err, _THIS_IP_);
+ return err;
+}
+
+#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
extern struct wait_queue_head bch2_read_only_wait;
static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
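Moving the definition below struct bch_fs lets __bch2_err_trace() bump the per-cpu error_throw counter in addition to emitting the tracepoint. A hedged usage sketch (BCH_ERR_ENOENT_dev_not_found is assumed here purely for illustration):

static int example_dev_check(struct bch_fs *c, unsigned dev)
{
	if (dev >= c->sb.nr_devices)
		/* increments BCH_COUNTER_error_throw and records the
		 * throw site via the error_throw tracepoint */
		return bch_err_throw(c, ENOENT_dev_not_found);
	return 0;
}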
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index b4a04df5ea95..b2de993d802b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -423,7 +423,8 @@ enum bch_bkey_type_flags {
x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
- x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
+ x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) \
+ x(extent_whiteout, 36, BKEY_TYPE_strict_btree_checks)
enum bch_bkey_type {
#define x(name, nr, ...) KEY_TYPE_##name = nr,
@@ -440,6 +441,10 @@ struct bch_whiteout {
struct bch_val v;
};
+struct bch_extent_whiteout {
+ struct bch_val v;
+};
+
struct bch_error {
struct bch_val v;
};
@@ -700,7 +705,9 @@ struct bch_sb_field_ext {
x(extent_flags, BCH_VERSION(1, 25)) \
x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
x(fast_device_removal, BCH_VERSION(1, 27)) \
- x(inode_has_case_insensitive, BCH_VERSION(1, 28))
+ x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
+ x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
+ x(31bit_dirent_offset, BCH_VERSION(1, 30))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1340,6 +1347,7 @@ enum btree_id_flags {
BTREE_IS_snapshots| \
BTREE_IS_data, \
BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_extent_whiteout)| \
BIT_ULL(KEY_TYPE_error)| \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_extent)| \
@@ -1371,7 +1379,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_alloc_v4)) \
x(quotas, 5, 0, \
BIT_ULL(KEY_TYPE_quota)) \
- x(stripes, 6, 0, \
+ x(stripes, 6, \
+ BTREE_IS_data, \
BIT_ULL(KEY_TYPE_stripe)) \
x(reflink, 7, \
BTREE_IS_extents| \
@@ -1431,9 +1440,9 @@ enum btree_id {
*/
#define BTREE_ID_NR_MAX 63
-static inline bool btree_id_is_alloc(enum btree_id id)
+static inline bool btree_id_is_alloc(enum btree_id btree)
{
- switch (id) {
+ switch (btree) {
case BTREE_ID_alloc:
case BTREE_ID_backpointers:
case BTREE_ID_need_discard:
@@ -1447,6 +1456,33 @@ static inline bool btree_id_is_alloc(enum btree_id id)
}
}
+/* We can reconstruct these btrees from information in other btrees */
+static inline bool btree_id_can_reconstruct(enum btree_id btree)
+{
+ if (btree_id_is_alloc(btree))
+ return true;
+
+ switch (btree) {
+ case BTREE_ID_snapshot_trees:
+ case BTREE_ID_deleted_inodes:
+ case BTREE_ID_rebalance_work:
+ case BTREE_ID_subvolume_children:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * We can reconstruct BTREE_ID_alloc, but reconstructing it from scratch is not
+ * so cheap and OOMs on huge filesystems (until we have online
+ * check_allocations)
+ */
+static inline bool btree_id_recovers_from_scan(enum btree_id btree)
+{
+ return btree == BTREE_ID_alloc || !btree_id_can_reconstruct(btree);
+}
+
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
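Taken together with btree_id_is_alloc(), the two new predicates partition the btrees roughly as follows (a sanity table, not exhaustive):

/*
 * btree                    can_reconstruct   recovers_from_scan
 * BTREE_ID_alloc           true              true   (rebuilding OOMs; scan instead)
 * BTREE_ID_backpointers    true              false  (cheap to rebuild)
 * BTREE_ID_rebalance_work  true              false
 * BTREE_ID_extents         false             true   (real data; must scan)
 */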
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 52594e925eb7..5dc562f2a881 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -66,33 +66,46 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_STOP _IO(0xbc, 3)
#endif
-#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
-#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
-#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
-#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
-#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
-#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
-#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
-#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
-
-#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
-#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
-
-#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
-
-#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
-#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
-#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
-#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ADD_v2 _IOW(0xbc, 23, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE_v2 _IOW(0xbc, 24, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE_v2 _IOW(0xbc, 25, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE_v2 _IOW(0xbc, 26, struct bch_ioctl_disk_v2)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DISK_SET_STATE_v2 _IOW(0xbc, 22, struct bch_ioctl_disk_set_state_v2)
+#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_v2 _IOW(0xbc, 27, struct bch_ioctl_disk_resize_v2)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL_v2 _IOW(0xbc, 28, struct bch_ioctl_disk_resize_journal_v2)
+
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
+#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
+
+#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
+#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
/* ioctls below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
+struct bch_ioctl_err_msg {
+ __u64 msg_ptr;
+ __u32 msg_len;
+ __u32 pad;
+};
+
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
*
@@ -104,13 +117,6 @@ struct bch_ioctl_query_uuid {
__uuid_t uuid;
};
-#if 0
-struct bch_ioctl_start {
- __u32 flags;
- __u32 pad;
-};
-#endif
-
/*
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
*
@@ -164,6 +170,13 @@ struct bch_ioctl_disk {
__u64 dev;
};
+struct bch_ioctl_disk_v2 {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ struct bch_ioctl_err_msg err;
+};
+
/*
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
*
@@ -181,6 +194,14 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
+struct bch_ioctl_disk_set_state_v2 {
+ __u32 flags;
+ __u8 new_state;
+ __u8 pad[3];
+ __u64 dev;
+ struct bch_ioctl_err_msg err;
+};
+
#define BCH_DATA_OPS() \
x(scrub, 0) \
x(rereplicate, 1) \
@@ -392,6 +413,14 @@ struct bch_ioctl_disk_resize {
__u64 nbuckets;
};
+struct bch_ioctl_disk_resize_v2 {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+ struct bch_ioctl_err_msg err;
+};
+
/*
* BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
*
@@ -405,6 +434,14 @@ struct bch_ioctl_disk_resize_journal {
__u64 nbuckets;
};
+struct bch_ioctl_disk_resize_journal_v2 {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+ struct bch_ioctl_err_msg err;
+};
+
struct bch_ioctl_subvolume {
__u32 flags;
__u32 dirfd;
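Each _v2 ioctl extends its v1 counterpart with a struct bch_ioctl_err_msg, a userspace buffer the kernel can fill with a human-readable error message (the kernel-side plumbing is in chardev.c, not shown in this hunk). A hedged userspace sketch, assuming .dev carries a path pointer for DISK_ADD as in v1 and that the kernel null-terminates the message on failure:

#include <stdio.h>
#include <sys/ioctl.h>
#include "bcachefs_ioctl.h"

static int disk_add_v2(int fs_fd, const char *dev_path)
{
	char errbuf[256] = "";
	struct bch_ioctl_disk_v2 arg = {
		.dev = (__u64)(unsigned long) dev_path,
		.err = {
			.msg_ptr = (__u64)(unsigned long) errbuf,
			.msg_len = sizeof(errbuf),
		},
	};

	int ret = ioctl(fs_fd, BCH_IOCTL_DISK_ADD_v2, &arg);
	if (ret < 0)
		fprintf(stderr, "device add failed: %s\n", errbuf);
	return ret;
}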
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
index a30c4ae8eb36..05a01bf86039 100644
--- a/fs/bcachefs/bkey_buf.h
+++ b/fs/bcachefs/bkey_buf.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_BKEY_BUF_H
#define _BCACHEFS_BKEY_BUF_H
+#include <linux/mempool.h>
+
#include "bcachefs.h"
#include "bkey.h"
@@ -10,41 +12,49 @@ struct bkey_buf {
u64 onstack[12];
};
-static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
- struct bch_fs *c, unsigned u64s)
+static inline int bch2_bkey_buf_realloc_noprof(struct bkey_buf *s,
+ struct bch_fs *c, unsigned u64s)
{
if (s->k == (void *) s->onstack &&
u64s > ARRAY_SIZE(s->onstack)) {
- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ s->k = mempool_alloc_noprof(&c->large_bkey_pool, GFP_NOFS);
memcpy(s->k, s->onstack, sizeof(s->onstack));
}
+
+ return 0; /* for alloc_hooks() macro */
}
+#define bch2_bkey_buf_realloc(...) alloc_hooks(bch2_bkey_buf_realloc_noprof(__VA_ARGS__))
-static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_s_c k)
+static inline int bch2_bkey_buf_reassemble_noprof(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
{
- bch2_bkey_buf_realloc(s, c, k.k->u64s);
+ bch2_bkey_buf_realloc_noprof(s, c, k.k->u64s);
bkey_reassemble(s->k, k);
+ return 0;
}
+#define bch2_bkey_buf_reassemble(...) alloc_hooks(bch2_bkey_buf_reassemble_noprof(__VA_ARGS__))
-static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
- struct bch_fs *c,
- struct bkey_i *src)
+static inline int bch2_bkey_buf_copy_noprof(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_i *src)
{
- bch2_bkey_buf_realloc(s, c, src->k.u64s);
+ bch2_bkey_buf_realloc_noprof(s, c, src->k.u64s);
bkey_copy(s->k, src);
+ return 0;
}
+#define bch2_bkey_buf_copy(...) alloc_hooks(bch2_bkey_buf_copy_noprof(__VA_ARGS__))
-static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
- struct bch_fs *c,
- struct btree *b,
- struct bkey_packed *src)
+static inline int bch2_bkey_buf_unpack_noprof(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct btree *b,
+ struct bkey_packed *src)
{
- bch2_bkey_buf_realloc(s, c, BKEY_U64s +
- bkeyp_val_u64s(&b->format, src));
+ bch2_bkey_buf_realloc_noprof(s, c, BKEY_U64s + bkeyp_val_u64s(&b->format, src));
bch2_bkey_unpack(b, s->k, src);
+ return 0;
}
+#define bch2_bkey_buf_unpack(...) alloc_hooks(bch2_bkey_buf_unpack_noprof(__VA_ARGS__))
static inline void bch2_bkey_buf_init(struct bkey_buf *s)
{
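The conversion follows the kernel's memory allocation profiling convention (<linux/alloc_tag.h>): the body moves into a _noprof variant built on _noprof allocators, and an alloc_hooks() wrapper macro charges the allocation to the outermost caller's source location — which is why these helpers now return int, since alloc_hooks() needs a value to pass through. A minimal sketch of the same pattern (struct foo and my_helper are hypothetical):

static inline int my_helper_noprof(struct foo *f)
{
	/* _noprof allocator, so the wrapper's alloc tag is charged */
	f->buf = kmalloc_noprof(64, GFP_KERNEL);
	return f->buf ? 0 : -ENOMEM;
}
#define my_helper(...)	alloc_hooks(my_helper_noprof(__VA_ARGS__))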
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index fcd8c82cba4f..75d73677c4d8 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -41,6 +41,10 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
.key_validate = deleted_key_validate, \
})
+#define bch2_bkey_ops_extent_whiteout ((struct bkey_ops) { \
+ .key_validate = deleted_key_validate, \
+})
+
static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
@@ -203,7 +207,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
? bch2_bkey_types[k.k->type]
: "(unknown)");
- if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ if (btree_node_type_is_extents(type) && !bkey_extent_whiteout(k.k)) {
bkey_fsck_err_on(k.k->size == 0,
c, bkey_extent_size_zero,
"size == 0");
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
index b4f328f9853c..88a48ce63656 100644
--- a/fs/bcachefs/bkey_types.h
+++ b/fs/bcachefs/bkey_types.h
@@ -44,6 +44,11 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+#define bkey_extent_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || \
+ (_k)->type == KEY_TYPE_whiteout || \
+ (_k)->type == KEY_TYPE_extent_whiteout)
+
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 8716eedd43fc..59638d09e1fd 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -788,7 +788,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
goto got_node;
}
- b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
+ b = __btree_node_mem_alloc(c, GFP_NOWAIT);
if (b) {
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
} else {
@@ -826,7 +826,7 @@ got_node:
mutex_unlock(&bc->lock);
- if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+ if (btree_node_data_alloc(c, b, GFP_NOWAIT)) {
bch2_trans_unlock(trans);
if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
goto err;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 6b91649688da..43f294284d57 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -44,27 +44,6 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
-/*
- * Returns true if it's a btree we can easily reconstruct, or otherwise won't
- * cause data loss if it's missing:
- */
-static bool btree_id_important(enum btree_id btree)
-{
- if (btree_id_is_alloc(btree))
- return false;
-
- switch (btree) {
- case BTREE_ID_quotas:
- case BTREE_ID_snapshot_trees:
- case BTREE_ID_logged_ops:
- case BTREE_ID_rebalance_work:
- case BTREE_ID_subvolume_children:
- return false;
- default:
- return true;
- }
-}
-
static const char * const bch2_gc_phase_strs[] = {
#define x(n) #n,
GC_PHASES()
@@ -377,7 +356,7 @@ again:
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
BUG_ON(bpos_gt(k.k->p, b->data->max_key));
@@ -491,7 +470,7 @@ again:
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
@@ -557,45 +536,55 @@ fsck_err:
return ret;
}
-static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
+static int bch2_topology_check_root(struct btree_trans *trans, enum btree_id btree,
bool *reconstructed_root)
{
struct bch_fs *c = trans->c;
struct btree_root *r = bch2_btree_id_root(c, btree);
- CLASS(printbuf, buf)();
- int ret = 0;
-
- bch2_btree_id_to_text(&buf, btree);
- if (r->error) {
- bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
+ if (!r->error)
+ return 0;
- ret = bch2_btree_has_scanned_nodes(c, btree);
- if (ret < 0)
- goto err;
+ CLASS(printbuf, buf)();
+ int ret = 0;
- if (!ret) {
- __fsck_err(trans,
- FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
- btree_root_unreadable_and_scan_found_nothing,
- "no nodes found for btree %s, continue?", buf.buf);
+ if (!btree_id_recovers_from_scan(btree)) {
+ r->alive = false;
+ r->error = 0;
+ bch2_btree_root_alloc_fake_trans(trans, btree, 0);
+ ret = bch2_btree_lost_data(c, &buf, btree);
+ bch2_print_str(c, KERN_NOTICE, buf.buf);
+ goto out;
+ }
- r->alive = false;
- r->error = 0;
- bch2_btree_root_alloc_fake_trans(trans, btree, 0);
- } else {
- r->alive = false;
- r->error = 0;
- bch2_btree_root_alloc_fake_trans(trans, btree, 1);
+ bch2_btree_id_to_text(&buf, btree);
+ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
- bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
- if (ret)
- return ret;
- }
+ ret = bch2_btree_has_scanned_nodes(c, btree);
+ if (ret < 0)
+ goto err;
- *reconstructed_root = true;
+ if (!ret) {
+ __fsck_err(trans,
+ FSCK_CAN_FIX|(btree_id_can_reconstruct(btree) ? FSCK_AUTOFIX : 0),
+ btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", buf.buf);
+
+ r->alive = false;
+ r->error = 0;
+ bch2_btree_root_alloc_fake_trans(trans, btree, 0);
+ } else {
+ r->alive = false;
+ r->error = 0;
+ bch2_btree_root_alloc_fake_trans(trans, btree, 1);
+
+ bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ return ret;
}
+out:
+ *reconstructed_root = true;
err:
fsck_err:
bch_err_fn(c, ret);
@@ -613,7 +602,7 @@ int bch2_check_topology(struct bch_fs *c)
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
bool reconstructed_root = false;
recover:
- ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
+ ret = lockrestart_do(trans, bch2_topology_check_root(trans, i, &reconstructed_root));
if (ret)
break;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 8a03cd75a64f..2e3dd9bacac5 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -131,10 +131,10 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
- p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = kvmalloc(size, GFP_NOWAIT|__GFP_ACCOUNT|__GFP_RECLAIMABLE);
if (!p) {
*used_mempool = true;
- p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS|__GFP_ACCOUNT|__GFP_RECLAIMABLE);
}
memalloc_nofs_restore(flags);
return p;
@@ -1014,6 +1014,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k = bkey_p_next(k);
continue;
drop_this_key:
+ ret = 0;
next_good_key = k->u64s;
if (!next_good_key ||
@@ -1470,7 +1471,7 @@ start:
}
prt_newline(&buf);
- if (failed.nr)
+ if (ret || failed.nr)
bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
async_object_list_del(c, btree_read_bio, rb->list_idx);
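btree_bounce_alloc() keeps its two-step shape — an opportunistic allocation that must not sleep, then a mempool fallback that may block but cannot fail — now with __GFP_ACCOUNT|__GFP_RECLAIMABLE so the memory is charged to the memcg as reclaimable. The pattern in isolation (my_pool is a hypothetical mempool_t):

bool used_mempool = false;
void *p = kvmalloc(size, GFP_NOWAIT);
if (!p) {
	used_mempool = true;
	p = mempool_alloc(&my_pool, GFP_NOFS);	/* may sleep, never fails */
}
/* ... use p ... */
if (used_mempool)
	mempool_free(p, &my_pool);
else
	kvfree(p);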
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index a67babf69d39..1e152c671bd7 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -650,7 +650,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
i->k->k.p);
@@ -848,7 +848,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
break;
bch2_btree_and_journal_iter_advance(jiter);
- k = bch2_btree_and_journal_iter_peek(jiter);
+ k = bch2_btree_and_journal_iter_peek(c, jiter);
if (!k.k)
break;
@@ -898,7 +898,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
- k = bch2_btree_and_journal_iter_peek(&jiter);
+ k = bch2_btree_and_journal_iter_peek(c, &jiter);
if (!k.k) {
CLASS(printbuf, buf)();
@@ -2120,10 +2120,10 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_
}
}
-static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_pos,
- struct bpos end_pos)
+static const struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_pos,
+ struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2139,7 +2139,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
+ const struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
if (k) {
iter->k = k->k;
@@ -2156,7 +2156,7 @@ void btree_trans_peek_journal(struct btree_trans *trans,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
+ const struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
@@ -2165,10 +2165,10 @@ void btree_trans_peek_journal(struct btree_trans *trans,
}
}
-static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos search_key,
- struct bpos end_pos)
+static const struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_key,
+ struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2186,7 +2186,7 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *next_journal =
+ const struct bkey_i *next_journal =
bch2_btree_journal_peek_prev(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->data->min_key);
@@ -2366,7 +2366,9 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_nofilter_whiteouts) &&
+ bkey_eq(end, POS_MAX));
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
@@ -2450,10 +2452,27 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
continue;
}
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
+ if (!(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
+ /*
+ * KEY_TYPE_extent_whiteout indicates that there
+ * are no extents that overlap with this
+ * whiteout - meaning bkey_start_pos() is
+ * monotonically increasing when including
+ * KEY_TYPE_extent_whiteout (not
+ * KEY_TYPE_whiteout).
+ *
+ * Without this @end wouldn't be able to
+ * terminate searches and we'd have to scan
+ * through tons of whiteouts:
+ */
+ if (k.k->type == KEY_TYPE_extent_whiteout &&
+ bkey_ge(k.k->p, end))
+ goto end;
+
+ if (bkey_extent_whiteout(k.k)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
}
}
@@ -2711,7 +2730,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
saved_path = 0;
}
- if (!bkey_whiteout(k.k)) {
+ if (!bkey_extent_whiteout(k.k)) {
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
@@ -2724,7 +2743,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
continue;
}
- if (bkey_whiteout(k.k)) {
+ if (bkey_extent_whiteout(k.k)) {
search_key = bkey_predecessor(iter, k.k->p);
search_key.snapshot = U32_MAX;
continue;
@@ -2865,7 +2884,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
iter->k = *k.k;
}
- if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+ if (unlikely(bkey_extent_whiteout(k.k) &&
(iter->flags & BTREE_ITER_filter_snapshots) &&
!(iter->flags & BTREE_ITER_nofilter_whiteouts)))
iter->k.type = KEY_TYPE_deleted;
@@ -2878,31 +2897,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
EBUG_ON(btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_intent) {
- struct btree_iter iter2;
+ struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_max(&iter2, end);
+ bch2_trans_copy_iter(&iter2, iter);
+ iter2.flags |= BTREE_ITER_nofilter_whiteouts;
- if (k.k && !bkey_err(k)) {
- swap(iter->key_cache_path, iter2.key_cache_path);
- iter->k = iter2.k;
- k.k = &iter->k;
+ while (1) {
+ k = bch2_btree_iter_peek_max(&iter2, end);
+ if ((iter2.flags & BTREE_ITER_is_extents) &&
+ k.k &&
+ !bkey_err(k) &&
+ k.k->type == KEY_TYPE_whiteout) {
+ bch2_btree_iter_set_pos(&iter2, k.k->p);
+ continue;
}
- bch2_trans_iter_exit(&iter2);
- } else {
- struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_max(iter, end);
- if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(iter, pos);
- else
- iter->pos = pos;
+ break;
+ }
+
+ if (k.k && !bkey_err(k)) {
+ swap(iter->key_cache_path, iter2.key_cache_path);
+ iter->k = iter2.k;
+ k.k = &iter->k;
}
+ bch2_trans_iter_exit(&iter2);
if (unlikely(bkey_err(k)))
goto out;
+ if (unlikely(k.k &&
+ bkey_extent_whiteout(k.k) &&
+ (iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_nofilter_whiteouts)))
+ iter->k.type = KEY_TYPE_deleted;
+
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
if (bkey_lt(iter->pos, next)) {
@@ -3243,9 +3271,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
EBUG_ON(trans->mem_bytes);
EBUG_ON(trans->mem_top);
EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
-
+
bool lock_dropped = false;
- new_mem = allocate_dropping_locks_norelock(trans, lock_dropped, kmalloc(new_bytes, _gfp));
+ new_mem = allocate_dropping_locks_norelock(trans, lock_dropped,
+ kmalloc(new_bytes, _gfp|__GFP_NOWARN));
if (!new_mem) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
@@ -3497,7 +3526,7 @@ got_trans:
if (s->max_mem) {
unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
- trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL|__GFP_NOWARN);
if (likely(trans->mem))
trans->mem_bytes = expected_mem_bytes;
}
@@ -3668,6 +3697,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
+ if (trans->journal_replay_not_finished)
+ prt_printf(out, "has journal_keys ref\n");
+
/* trans->paths is rcu protected vs. freeing */
guard(rcu)();
guard(printbuf_atomic)(out);
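For context on the extent-whiteout filtering added here: on an extents btree, keys sort by their end position (k.p), and an ordinary KEY_TYPE_whiteout is a zero-size snapshot whiteout that can land at any offset covered by extents in other snapshots, so bkey_start_pos() is not monotonic across them. An illustrative ordering with hypothetical keys:

/*
 *   sort order by k.p:  whiteout [15,15) snap 2,  extent [10,20) snap 1,
 *                       extent [12,30) snap 3
 *   start positions:    15, 10, 12   <- not monotonic
 *
 * KEY_TYPE_extent_whiteout asserts that no extent overlaps it, so with
 * only those whiteouts included bkey_start_pos() increases
 * monotonically and peek_max() may terminate at @end.
 */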
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index b117cb5d7f94..c8fc6ee01d96 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -954,7 +954,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
#define allocate_dropping_locks_errcode(_trans, _do) \
({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ gfp_t _gfp = GFP_NOWAIT; \
int _ret = _do; \
\
if (bch2_err_matches(_ret, ENOMEM)) { \
@@ -966,7 +966,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
#define allocate_dropping_locks(_trans, _ret, _do) \
({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ gfp_t _gfp = GFP_NOWAIT; \
typeof(_do) _p = _do; \
\
_ret = 0; \
@@ -979,7 +979,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
#define allocate_dropping_locks_norelock(_trans, _lock_dropped, _do) \
({ \
- gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ gfp_t _gfp = GFP_NOWAIT; \
typeof(_do) _p = _do; \
_lock_dropped = false; \
if (unlikely(!_p)) { \
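All three macros supply a _gfp variable that the caller's expression is expected to reference: the expression is tried first with GFP_NOWAIT, and on failure the macro drops btree locks and retries with GFP_KERNEL (relocking afterwards, except in the _norelock variant, which only reports that locks were dropped). A hedged usage sketch (new_u64s is a placeholder):

int ret = 0;
struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
			kmalloc(new_u64s * sizeof(u64), _gfp));
if (!ret && !new_k)
	ret = -ENOMEM;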
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 24f2fbe84ad7..a6f344faf751 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -46,21 +46,22 @@ static size_t __bch2_journal_key_search(struct journal_keys *keys,
enum btree_id id, unsigned level,
struct bpos pos)
{
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
size_t l = 0, r = keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+ if (__journal_key_cmp(c, id, level, pos, idx_to_key(keys, m)) > 0)
l = m + 1;
else
r = m;
}
BUG_ON(l < keys->nr &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+ __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l)) > 0);
BUG_ON(l &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+ __journal_key_cmp(c, id, level, pos, idx_to_key(keys, l - 1)) <= 0);
return l;
}
@@ -72,10 +73,20 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
+static inline struct journal_key_range_overwritten *__overwrite_range(struct journal_keys *keys, u32 idx)
+{
+ return idx ? keys->overwrites.data + idx : NULL;
+}
+
+static inline struct journal_key_range_overwritten *overwrite_range(struct journal_keys *keys, u32 idx)
+{
+ return idx ? rcu_dereference(keys->overwrites.data) + idx : NULL;
+}
+
/* Returns first non-overwritten key >= search key: */
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
+const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
{
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
@@ -87,7 +98,7 @@ search:
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
--(*idx);
iters++;
if (iters == 10) {
@@ -96,23 +107,23 @@ search:
}
}
- struct bkey_i *ret = NULL;
+ const struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+ if (__journal_key_cmp(c, btree_id, level, end_pos, k) < 0)
break;
if (k->overwritten) {
if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->end;
+ *idx = overwrite_range(keys, k->overwritten_range)->end;
else
*idx += 1;
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
- ret = k->k;
+ if (__journal_key_cmp(c, btree_id, level, pos, k) <= 0) {
+ ret = journal_key_k(c, k);
break;
}
@@ -129,9 +140,9 @@ search:
return ret;
}
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
+const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
{
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
@@ -146,7 +157,7 @@ search:
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx < keys->nr &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
+ __journal_key_cmp(c, btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
(*idx)++;
iters++;
if (iters == 10) {
@@ -158,25 +169,25 @@ search:
if (*idx == keys->nr)
--(*idx);
- struct bkey_i *ret = NULL;
+ const struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while (true) {
k = idx_to_key(keys, *idx);
- if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
+ if (__journal_key_cmp(c, btree_id, level, end_pos, k) > 0)
break;
if (k->overwritten) {
if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->start;
+ *idx = overwrite_range(keys, k->overwritten_range)->start;
if (!*idx)
break;
--(*idx);
continue;
}
- if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
- ret = k->k;
+ if (__journal_key_cmp(c, btree_id, level, pos, k) >= 0) {
+ ret = journal_key_k(c, k);
break;
}
@@ -194,8 +205,8 @@ search:
return ret;
}
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos)
+const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
{
size_t idx = 0;
@@ -264,13 +275,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
struct journal_key n = {
.btree_id = id,
.level = level,
- .k = k,
.allocated = true,
- /*
- * Ensure these keys are done last by journal replay, to unblock
- * journal reclaim:
- */
- .journal_seq = U64_MAX,
+ .allocated_k = k,
};
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
@@ -278,8 +284,8 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
- journal_key_cmp(&n, &keys->data[idx]) == 0) {
- struct bkey_i *o = keys->data[idx].k;
+ journal_key_cmp(c, &n, &keys->data[idx]) == 0) {
+ struct bkey_i *o = journal_key_k(c, &keys->data[idx]);
if (k->k.type == KEY_TYPE_accounting &&
o->k.type == KEY_TYPE_accounting) {
@@ -291,7 +297,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
}
if (keys->data[idx].allocated)
- kfree(keys->data[idx].k);
+ kfree(keys->data[idx].allocated_k);
keys->data[idx] = n;
return 0;
}
@@ -376,17 +382,20 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
unsigned level, struct bpos pos)
{
- struct journal_keys *keys = &trans->c->journal_keys;
+ if (!trans->journal_replay_not_finished)
+ return false;
+
+ struct bch_fs *c = trans->c;
+ struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (!trans->journal_replay_not_finished)
+ if (idx >= keys->size ||
+ keys->data[idx].btree_id != btree ||
+ keys->data[idx].level != level)
return false;
- return (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- bkey_deleted(&keys->data[idx].k->k));
+ struct bkey_i *k = journal_key_k(c, &keys->data[idx]);
+ return bpos_eq(k->k.p, pos) && bkey_deleted(&k->k);
}
static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
@@ -403,9 +412,9 @@ static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos
bool next_overwritten = next && next->overwritten;
struct journal_key_range_overwritten *prev_range =
- prev_overwritten ? prev->overwritten_range : NULL;
+ prev_overwritten ? __overwrite_range(keys, prev->overwritten_range) : NULL;
struct journal_key_range_overwritten *next_range =
- next_overwritten ? next->overwritten_range : NULL;
+ next_overwritten ? __overwrite_range(keys, next->overwritten_range) : NULL;
BUG_ON(prev_range && prev_range->end != idx);
BUG_ON(next_range && next_range->start != idx + 1);
@@ -413,37 +422,47 @@ static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos
if (prev_range && next_range) {
prev_range->end = next_range->end;
- keys->data[pos].overwritten_range = prev_range;
+ keys->data[pos].overwritten_range = prev->overwritten_range;
+
+ u32 old = next->overwritten_range;
+
for (size_t i = next_range->start; i < next_range->end; i++) {
struct journal_key *ip = keys->data + idx_to_pos(keys, i);
- BUG_ON(ip->overwritten_range != next_range);
- ip->overwritten_range = prev_range;
+ BUG_ON(ip->overwritten_range != old);
+ ip->overwritten_range = prev->overwritten_range;
}
-
- kfree_rcu_mightsleep(next_range);
} else if (prev_range) {
prev_range->end++;
- k->overwritten_range = prev_range;
+ k->overwritten_range = prev->overwritten_range;
if (next_overwritten) {
prev_range->end++;
- next->overwritten_range = prev_range;
+ next->overwritten_range = prev->overwritten_range;
}
} else if (next_range) {
next_range->start--;
- k->overwritten_range = next_range;
+ k->overwritten_range = next->overwritten_range;
if (prev_overwritten) {
next_range->start--;
- prev->overwritten_range = next_range;
+ prev->overwritten_range = next->overwritten_range;
}
} else if (prev_overwritten || next_overwritten) {
- struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
+ /* 0 is a sentinel value */
+ if (darray_resize_rcu(&keys->overwrites, max(keys->overwrites.nr + 1, 2)))
return;
- r->start = idx - (size_t) prev_overwritten;
- r->end = idx + 1 + (size_t) next_overwritten;
+ if (!keys->overwrites.nr)
+ darray_push(&keys->overwrites, (struct journal_key_range_overwritten) {});
+
+ darray_push(&keys->overwrites, ((struct journal_key_range_overwritten) {
+ .start = idx - (size_t) prev_overwritten,
+ .end = idx + 1 + (size_t) next_overwritten,
+ }));
+
+ smp_wmb();
+ u32 r = keys->overwrites.nr - 1;
+
+ k->overwritten_range = r;
- rcu_assign_pointer(k->overwritten_range, r);
if (prev_overwritten)
prev->overwritten_range = r;
if (next_overwritten)
@@ -457,11 +476,15 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
- if (idx < keys->size &&
- keys->data[idx].btree_id == btree &&
- keys->data[idx].level == level &&
- bpos_eq(keys->data[idx].k->k.p, pos) &&
- !keys->data[idx].overwritten) {
+ if (idx >= keys->size ||
+ keys->data[idx].btree_id != btree ||
+ keys->data[idx].level != level ||
+ keys->data[idx].overwritten)
+ return;
+
+ struct bkey_i *k = journal_key_k(c, &keys->data[idx]);
+
+ if (bpos_eq(k->k.p, pos)) {
guard(mutex)(&keys->overwrite_lock);
__bch2_journal_key_overwritten(keys, idx);
}
@@ -476,7 +499,7 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
}
}
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static struct bkey_s_c bch2_journal_iter_peek(struct bch_fs *c, struct journal_iter *iter)
{
journal_iter_verify(iter);
@@ -490,10 +513,10 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
BUG_ON(cmp);
if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
+ return bkey_i_to_s_c(journal_key_k(c, k));
if (k->overwritten_range)
- iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
+ iter->idx = idx_to_pos(iter->keys, overwrite_range(iter->keys, k->overwritten_range)->end);
else
bch2_journal_iter_advance(iter);
}
@@ -554,7 +577,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
while (nr--) {
bch2_btree_and_journal_iter_advance(&iter);
- struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+ struct bkey_s_c k = bch2_btree_and_journal_iter_peek(c, &iter);
if (!k.k)
break;
@@ -565,7 +588,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
bch2_bkey_buf_exit(&tmp, c);
}
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *c, struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
size_t iters = 0;
@@ -586,7 +609,7 @@ again:
bch2_journal_iter_advance_btree(iter);
if (iter->trans->journal_replay_not_finished)
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ while ((journal_k = bch2_journal_iter_peek(c, &iter->journal)).k &&
bpos_lt(journal_k.k->p, iter->pos))
bch2_journal_iter_advance(&iter->journal);
@@ -658,15 +681,22 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
/*
* When keys compare equal, oldest compares first:
*/
-static int journal_sort_key_cmp(const void *_l, const void *_r)
+static int journal_sort_key_cmp(const void *_l, const void *_r, const void *priv)
{
+ struct bch_fs *c = (void *) priv;
const struct journal_key *l = _l;
const struct journal_key *r = _r;
int rewind = l->rewind && r->rewind ? -1 : 1;
- return journal_key_cmp(l, r) ?:
- ((cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset)) * rewind);
+ int cmp = journal_key_cmp(c, l, r);
+ if (cmp)
+ return cmp;
+
+ if (l->allocated || r->allocated)
+ return cmp_int(l->allocated, r->allocated);
+
+ return ((cmp_int(l->journal_seq_offset, r->journal_seq_offset) ?:
+ cmp_int(l->journal_offset, r->journal_offset)) * rewind);
}
void bch2_journal_keys_put(struct bch_fs *c)
@@ -680,20 +710,16 @@ void bch2_journal_keys_put(struct bch_fs *c)
move_gap(keys, keys->nr);
- darray_for_each(*keys, i) {
- if (i->overwritten_range &&
- (i == &darray_last(*keys) ||
- i->overwritten_range != i[1].overwritten_range))
- kfree(i->overwritten_range);
-
+ darray_for_each(*keys, i)
if (i->allocated)
- kfree(i->k);
- }
+ kfree(i->allocated_k);
kvfree(keys->data);
keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
+ darray_exit(&keys->overwrites);
+
struct journal_replay **i;
struct genradix_iter iter;
@@ -704,8 +730,10 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
- journal_sort_key_cmp, NULL);
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
+
+ sort_r_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
+ journal_sort_key_cmp, NULL, c);
cond_resched();
@@ -717,9 +745,10 @@ static void __journal_keys_sort(struct journal_keys *keys)
* compare each individual accounting key against the version in
* the btree during replay:
*/
- if (src->k->k.type != KEY_TYPE_accounting &&
+ struct bkey_i *k = journal_key_k(c, src);
+ if (k->k.type != KEY_TYPE_accounting &&
src + 1 < &darray_top(*keys) &&
- !journal_key_cmp(src, src + 1))
+ !journal_key_cmp(c, src, src + 1))
continue;
*dst++ = *src;
@@ -763,8 +792,7 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.btree_id = entry->btree_id,
.level = entry->level,
.rewind = rewind,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
+ .journal_seq_offset = journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)),
.journal_offset = k->_data - i->j._data,
};
@@ -801,13 +829,18 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
move_gap(keys, keys->nr);
- darray_for_each(*keys, i)
+ darray_for_each(*keys, i) {
+ struct bkey_i *k = journal_key_k(c, i);
+
if (!(i->btree_id == btree &&
i->level >= level_min &&
i->level <= level_max &&
- bpos_ge(i->k->k.p, start) &&
- bpos_le(i->k->k.p, end)))
+ bpos_ge(k->k.p, start) &&
+ bpos_le(k->k.p, end)))
keys->data[dst++] = *i;
+ else if (i->allocated)
+ kfree(i->allocated_k);
+ }
keys->nr = keys->gap = dst;
}
@@ -825,7 +858,7 @@ void bch2_journal_keys_dump(struct bch_fs *c)
prt_printf(&buf, "btree=");
bch2_btree_id_to_text(&buf, i->btree_id);
prt_printf(&buf, " l=%u ", i->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(journal_key_k(c, i)));
pr_err("%s", buf.buf);
}
}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 2a3082919b8d..85d6969fa9b1 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -29,6 +29,22 @@ struct btree_and_journal_iter {
bool fail_if_too_many_whiteouts;
};
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+ return seq - c->journal_entries_base_seq;
+}
+
+static inline struct bkey_i *journal_key_k(struct bch_fs *c,
+ const struct journal_key *k)
+{
+ if (k->allocated)
+ return k->allocated_k;
+
+ struct journal_replay *i = *genradix_ptr(&c->journal_entries, k->journal_seq_offset);
+
+ return (struct bkey_i *) (i->j._data + k->journal_offset);
+}
+
static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
unsigned l_level,
const struct journal_key *r)
@@ -37,25 +53,28 @@ static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
cmp_int(l_btree_id, r->btree_id);
}
-static inline int __journal_key_cmp(enum btree_id l_btree_id,
+static inline int __journal_key_cmp(struct bch_fs *c,
+ enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
const struct journal_key *r)
{
return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
- bpos_cmp(l_pos, r->k->k.p);
+ bpos_cmp(l_pos, journal_key_k(c, r)->k.p);
}
-static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+static inline int journal_key_cmp(struct bch_fs *c,
+ const struct journal_key *l, const struct journal_key *r)
{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+ return __journal_key_cmp(c, l->btree_id, l->level,
+ journal_key_k(c, l)->k.p, r);
}
-struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+const struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
@@ -71,7 +90,7 @@ bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned,
void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct bch_fs *, struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
index 86aacb254fb2..4495fc92f848 100644
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ b/fs/bcachefs/btree_journal_iter_types.h
@@ -2,21 +2,47 @@
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
+struct journal_ptr {
+ bool csum_good;
+ struct bch_csum csum;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+};
+
+/*
+ * Only used for holding the journal entries we read in btree_journal_read()
+ * during cache_registration
+ */
+struct journal_replay {
+ DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
+
+ bool csum_good;
+ bool ignore_blacklisted;
+ bool ignore_not_dirty;
+ /* must be last: */
+ struct jset j;
+};
+
struct journal_key_range_overwritten {
size_t start, end;
};
struct journal_key {
- u64 journal_seq;
- u32 journal_offset;
+ union {
+ struct {
+ u32 journal_seq_offset;
+ u32 journal_offset;
+ };
+ struct bkey_i *allocated_k;
+ };
enum btree_id btree_id:8;
unsigned level:8;
bool allocated:1;
bool overwritten:1;
bool rewind:1;
- struct journal_key_range_overwritten __rcu *
- overwritten_range;
- struct bkey_i *k;
+ u32 overwritten_range;
};
struct journal_keys {
@@ -31,7 +57,9 @@ struct journal_keys {
size_t gap;
atomic_t ref;
bool initial_ref_held;
+
struct mutex overwrite_lock;
+ DARRAY(struct journal_key_range_overwritten) overwrites;
};
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
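Repacking journal_key this way means the (seq offset, journal offset) pair overlays the allocated-key pointer, costing no more than the pointer alone on 64-bit, and overwritten_range shrinks from an RCU pointer to a u32 index into the new overwrites darray. An illustrative layout check, assuming LP64:

#include <stdint.h>

_Static_assert(sizeof(union {
	struct { uint32_t seq_off, jnl_off; };
	void *allocated_k;
}) == sizeof(void *), "the union is pointer-sized on LP64");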
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 4b7b5ca74ba1..b618a0bd1186 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -149,7 +149,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
}
- if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ if (btree_id_can_reconstruct(BTREE_NODE_ID(bn)))
return;
if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
@@ -534,7 +534,7 @@ int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos node_min, struct bpos node_max)
{
- if (btree_id_is_alloc(btree))
+ if (!btree_id_recovers_from_scan(btree))
return 0;
struct find_btree_nodes *f = &c->found_btree_nodes;
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 8b94a8156fbf..5fa7f2f9f1e9 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -54,7 +54,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
if (j_k)
@@ -449,7 +449,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 6f3b57573cba..b70eb095a37e 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -224,14 +224,14 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
update->k.p = old.k->p;
update->k.p.snapshot = new.k->p.snapshot;
- if (new.k->p.snapshot != old.k->p.snapshot) {
- update->k.type = KEY_TYPE_whiteout;
- } else if (btree_type_has_snapshots(btree_id)) {
- ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+ if (btree_type_has_snapshots(btree_id)) {
+ ret = new.k->p.snapshot != old.k->p.snapshot
+ ? 1
+ : need_whiteout_for_snapshot(trans, btree_id, update->k.p);
if (ret < 0)
return ret;
if (ret)
- update->k.type = KEY_TYPE_whiteout;
+ update->k.type = extent_whiteout_type(trans->c, iter->btree_id, new.k);
}
ret = bch2_btree_insert_nonextent(trans, btree_id, update,
@@ -265,7 +265,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k),
BTREE_ITER_intent|
BTREE_ITER_with_updates|
- BTREE_ITER_not_extents);
+ BTREE_ITER_not_extents|
+ BTREE_ITER_nofilter_whiteouts);
struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
int ret = bkey_err(k);
if (ret)
@@ -283,12 +284,40 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
goto next;
}
- while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
- bool done = bkey_lt(insert->k.p, k.k->p);
+ while (true) {
+ BUG_ON(bkey_le(k.k->p, bkey_start_pos(&insert->k)));
- ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
- if (ret)
- return ret;
+ /*
+ * When KEY_TYPE_whiteout is included, bkey_start_pos is not
+ * monotonically increasing
+ */
+ if (k.k->type != KEY_TYPE_whiteout && bkey_le(insert->k.p, bkey_start_pos(k.k)))
+ break;
+
+ bool done = k.k->type != KEY_TYPE_whiteout && bkey_lt(insert->k.p, k.k->p);
+
+ if (bkey_extent_whiteout(k.k)) {
+ enum bch_bkey_type whiteout_type = extent_whiteout_type(trans->c, btree_id, &insert->k);
+
+ if (bkey_le(k.k->p, insert->k.p) &&
+ k.k->type != whiteout_type) {
+ struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ return ret;
+
+ update->k.p.snapshot = iter.snapshot;
+ update->k.type = whiteout_type;
+
+ ret = bch2_trans_update(trans, &iter, update, 0);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+ if (ret)
+ return ret;
+ }
if (done)
goto out;
@@ -374,7 +403,7 @@ __btree_trans_update_by_path(struct btree_trans *trans,
i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
+ const struct bkey_i *j_k =
bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
if (j_k) {
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 663739db82b1..18560ca80057 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -5,6 +5,7 @@
#include "btree_iter.h"
#include "journal.h"
#include "snapshot.h"
+#include "super-io.h"
struct bch_fs;
struct btree;
@@ -110,6 +111,22 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
: 0;
}
+static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree,
+ const struct bkey *k)
+{
+ /*
+ * KEY_TYPE_extent_whiteout indicates that there isn't a real extent
+ * present at that position: key start positions inclusive of
+ * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are
+ * monotonically increasing
+ */
+ return btree_id_is_extents_snapshots(btree) &&
+ bkey_deleted(k) &&
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts)
+ ? KEY_TYPE_extent_whiteout
+ : KEY_TYPE_whiteout;
+}
+
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
enum btree_iter_update_trigger_flags,
struct bkey_s_c, struct bkey_s_c);
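As a minimal standalone sketch of the decision extent_whiteout_type() encodes
above, with stand-in booleans for the btree-type and format-version checks
(the names below are illustrative, not the kernel helpers):

    #include <stdbool.h>

    enum whiteout_kind { PLAIN_WHITEOUT, EXTENT_WHITEOUT };

    /*
     * A deleted key in an extents+snapshots btree may use the
     * position-monotonic extent whiteout, but only when the on-disk
     * format version permits it:
     */
    static enum whiteout_kind whiteout_kind(bool extents_snapshots_btree,
                                            bool key_deleted,
                                            bool format_has_extent_whiteouts)
    {
        return extents_snapshots_btree && key_deleted &&
               format_has_extent_whiteouts
                ? EXTENT_WHITEOUT
                : PLAIN_WHITEOUT;
    }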
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 76897cf15946..a9877a47bfc6 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -95,7 +95,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!b->c.level)
goto out;
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ while ((k = bch2_btree_and_journal_iter_peek(c, &iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
goto out;
@@ -336,6 +336,20 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
BUG_ON(b->ob.nr);
mutex_lock(&c->btree_reserve_cache_lock);
+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
+ guard(spinlock)(&c->freelist_lock);
+ if (c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)) {
+ if (cl)
+ closure_wait(&c->open_buckets_wait, cl);
+
+ ret = cl
+ ? bch_err_throw(c, bucket_alloc_blocked)
+ : bch_err_throw(c, open_buckets_empty);
+ mutex_unlock(&c->btree_reserve_cache_lock);
+ goto err;
+ }
+ }
+
if (c->btree_reserve_cache_nr > nr_reserve) {
for (struct btree_alloc *a = c->btree_reserve_cache;
a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) {
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index afad11831e1d..755fb25a8eba 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -701,8 +701,16 @@ int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
struct bkey_i_accounting *k)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key new = { .btree = btree };
+ if (trace_accounting_key_to_wb_slowpath_enabled()) {
+ CLASS(printbuf, buf)();
+ prt_printf(&buf, "have: %zu\n", wb->accounting.nr);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&k->k_i));
+ trace_accounting_key_to_wb_slowpath(c, buf.buf);
+ }
+ count_event(c, accounting_key_to_wb_slowpath);
+
+ struct btree_write_buffered_key new = { .btree = btree };
bkey_copy(&new.k, &k->k_i);
int ret = darray_push(&wb->accounting, new);
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index e484cd6b90b0..b862bdf67f58 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -95,7 +95,7 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c,
EBUG_ON(!dst->seq);
- return k->k.type == KEY_TYPE_accounting
+ return bch2_bkey_is_accounting_mem(&k->k)
? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
: __bch2_journal_key_to_wb(c, dst, btree, k);
}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 87a6f4dce296..6be1cc9ba0da 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -111,16 +111,28 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
CLASS(printbuf, buf)();
int ret = 0;
- CLASS(bch2_dev_tryget, ca)(c, p.ptr.dev);
+ CLASS(bch2_dev_tryget_noerror, ca)(c, p.ptr.dev);
if (!ca) {
- if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
- trans, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- *do_update = true;
+ if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
+ return 0;
+
+ if (test_bit(p.ptr.dev, c->devs_removed.d)) {
+ if (fsck_err(trans, ptr_to_removed_device,
+ "pointer to removed device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ } else {
+ if (fsck_err(trans, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ }
return 0;
}
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 467fc45e84fe..f6f90d421f27 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -187,6 +187,18 @@ static long bch2_ioctl_stop(struct bch_fs *c)
}
#endif
+static int copy_ioctl_err_msg(struct bch_ioctl_err_msg *dst, struct printbuf *src, int ret)
+{
+ if (ret) {
+ prt_printf(src, "error=%s", bch2_err_str(ret));
+ ret = copy_to_user_errcode((void __user *)(ulong)dst->msg_ptr,
+ src->buf,
+ min(src->pos, dst->msg_len)) ?: ret;
+ }
+
+ return ret;
+}
+
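The v2 ioctls below all funnel failures through this helper, which copies
whatever the operation printed into the printbuf (plus an "error=<name>"
suffix) out to a userspace buffer described by bch_ioctl_err_msg. A
hypothetical userspace caller could look like the sketch below; the struct
layouts are inferred from the msg_ptr/msg_len fields used here, not copied
from the real UAPI header:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    /* inferred shapes, for illustration only: */
    struct err_msg_sketch {
        uint64_t msg_ptr;   /* userspace buffer for the error message */
        uint64_t msg_len;   /* capacity of that buffer */
    };

    struct disk_v2_sketch {
        uint64_t dev;       /* pointer to a device path string */
        uint32_t flags;
        uint32_t pad;
        struct err_msg_sketch err;
    };

    static int run_disk_v2_ioctl(int fs_fd, unsigned long request,
                                 struct disk_v2_sketch *arg)
    {
        char msg[256] = "";

        arg->err.msg_ptr = (uint64_t)(uintptr_t)msg;
        arg->err.msg_len = sizeof(msg);

        int ret = ioctl(fs_fd, request, arg);
        if (ret < 0 && msg[0])
            fprintf(stderr, "bcachefs: %.*s\n", (int)sizeof(msg), msg);
        return ret;
    }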
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
{
char *path;
@@ -203,13 +215,37 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
if (ret)
return ret;
- ret = bch2_dev_add(c, path);
- if (!IS_ERR(path))
- kfree(path);
+ CLASS(printbuf, err)();
+ ret = bch2_dev_add(c, path, &err);
+ if (ret)
+ bch_err(c, "%s", err.buf);
+ kfree(path);
return ret;
}
+static long bch2_ioctl_disk_add_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ char *path = NULL;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ CLASS(printbuf, err)();
+ ret = bch2_dev_add(c, path, &err);
+ kfree(path);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
{
if (!capable(CAP_SYS_ADMIN))
@@ -226,7 +262,32 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
if (IS_ERR(ca))
return PTR_ERR(ca);
- return bch2_dev_remove(c, ca, arg.flags);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_remove(c, ca, arg.flags, &err);
+ if (ret)
+ bch_err(ca, "%s", err.buf);
+ return ret;
+}
+
+static long bch2_ioctl_disk_remove_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ struct bch_dev *ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_remove(c, ca, arg.flags, &err);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
}
static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
@@ -245,11 +306,36 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
if (ret)
return ret;
- ret = bch2_dev_online(c, path);
+ CLASS(printbuf, err)();
+ ret = bch2_dev_online(c, path, &err);
+ if (ret)
+ bch_err(c, "%s", err.buf);
kfree(path);
return ret;
}
+static long bch2_ioctl_disk_online_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ char *path;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ CLASS(printbuf, err)();
+ ret = bch2_dev_online(c, path, &err);
+ kfree(path);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
{
if (!capable(CAP_SYS_ADMIN))
@@ -266,7 +352,32 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
if (IS_ERR(ca))
return PTR_ERR(ca);
- return bch2_dev_offline(c, ca, arg.flags);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_offline(c, ca, arg.flags, &err);
+ if (ret)
+ bch_err(ca, "%s", err.buf);
+ return ret;
+}
+
+static long bch2_ioctl_disk_offline_v2(struct bch_fs *c, struct bch_ioctl_disk_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_offline(c, ca, arg.flags, &err);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
}
static long bch2_ioctl_disk_set_state(struct bch_fs *c,
@@ -287,11 +398,40 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- int ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags, &err);
bch_err_msg(ca, ret, "setting device state");
return ret;
}
+static long bch2_ioctl_disk_set_state_v2(struct bch_fs *c,
+ struct bch_ioctl_disk_set_state_v2 arg)
+{
+ CLASS(printbuf, err)();
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad[0] || arg.pad[1] || arg.pad[2] ||
+ arg.new_state >= BCH_MEMBER_STATE_NR)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ int ret = PTR_ERR_OR_ZERO(ca);
+ if (ret) {
+ prt_printf(&err, "device %llu not found\n", arg.dev);
+ goto err;
+ }
+
+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags, &err);
+err:
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
struct bch_data_ctx {
struct thread_with_file thr;
@@ -620,7 +760,30 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- return bch2_dev_resize(c, ca, arg.nbuckets);
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_resize(c, ca, arg.nbuckets, &err);
+ if (ret)
+ bch_err(ca, "%s", err.buf);
+ return ret;
+}
+
+static long bch2_ioctl_disk_resize_v2(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_dev_resize(c, ca, arg.nbuckets, &err);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
}
static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
@@ -643,6 +806,28 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
}
+static long bch2_ioctl_disk_resize_journal_v2(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_journal_v2 arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ if (arg.nbuckets > U32_MAX)
+ return -EINVAL;
+
+ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ CLASS(printbuf, err)();
+ int ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+ return copy_ioctl_err_msg(&arg.err, &err, ret);
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
@@ -684,20 +869,34 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_ADD_v2:
+ BCH_IOCTL(disk_add_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_REMOVE:
BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_REMOVE_v2:
+ BCH_IOCTL(disk_remove_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_ONLINE:
BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_ONLINE_v2:
+ BCH_IOCTL(disk_online_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_OFFLINE:
BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_OFFLINE_v2:
+ BCH_IOCTL(disk_offline_v2, struct bch_ioctl_disk_v2);
case BCH_IOCTL_DISK_SET_STATE:
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+ case BCH_IOCTL_DISK_SET_STATE_v2:
+ BCH_IOCTL(disk_set_state_v2, struct bch_ioctl_disk_set_state_v2);
case BCH_IOCTL_DATA:
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+ case BCH_IOCTL_DISK_RESIZE_v2:
+ BCH_IOCTL(disk_resize_v2, struct bch_ioctl_disk_resize_v2);
case BCH_IOCTL_DISK_RESIZE_JOURNAL:
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
+ case BCH_IOCTL_DISK_RESIZE_JOURNAL_v2:
+ BCH_IOCTL(disk_resize_journal_v2, struct bch_ioctl_disk_resize_journal_v2);
case BCH_IOCTL_FSCK_ONLINE:
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
case BCH_IOCTL_QUERY_ACCOUNTING:
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index e86d36d23e9e..6940037bd19e 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/log2.h>
+#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include "darray.h"
-int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp,
+ bool rcu)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
@@ -20,18 +22,25 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_
if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
return -ENOMEM;
- void *data = likely(bytes < INT_MAX)
+ void *old = d->data;
+ void *new = likely(bytes < INT_MAX)
? kvmalloc_noprof(bytes, gfp)
: vmalloc_noprof(bytes);
- if (!data)
+ if (!new)
return -ENOMEM;
if (d->size)
- memcpy(data, d->data, d->size * element_size);
- if (d->data != d->preallocated)
- kvfree(d->data);
- d->data = data;
+ memcpy(new, old, d->size * element_size);
+
+ rcu_assign_pointer(d->data, new);
d->size = new_size;
+
+ if (old != d->preallocated) {
+ if (!rcu)
+ kvfree(old);
+ else
+ kvfree_rcu_mightsleep(old);
+ }
}
return 0;
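With the new rcu flag, the new backing array is published with
rcu_assign_pointer() and the old one is only freed after a grace period, so
lockless readers can keep walking a stale copy while a writer grows the
array. A kernel-style sketch of the matching reader side (the darray type
and field names here are illustrative):

    struct sketch_darray {
        u64 __rcu   *data;
        size_t      nr, size;
    };

    /*
     * Readers need only rcu_read_lock(); they observe either the old or
     * the new backing array, both valid for the duration of the read-side
     * critical section:
     */
    static u64 sum_rcu(struct sketch_darray *d)
    {
        u64 sum = 0;

        guard(rcu)();
        u64 *data = rcu_dereference(d->data);
        size_t nr = READ_ONCE(d->nr);

        for (size_t i = 0; i < nr; i++)
            sum += data[i];
        return sum;
    }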
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 4080ee99aadd..b4f284fe9652 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -34,17 +34,17 @@ typedef DARRAY(s16) darray_s16;
typedef DARRAY(s32) darray_s32;
typedef DARRAY(s64) darray_s64;
-int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
+int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t, bool);
#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
-#define __darray_resize(_d, _element_size, _new_size, _gfp) \
+#define __darray_resize(_d, _element_size, _new_size, _gfp, _rcu) \
(unlikely((_new_size) > (_d)->size) \
- ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
+ ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp), _rcu)\
: 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
- __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp, false)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
@@ -55,6 +55,12 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
+#define darray_resize_rcu(_d, _new_size) \
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), GFP_KERNEL, true)
+
+#define darray_make_room_rcu(_d, _more) \
+ darray_resize_rcu((_d), (_d)->nr + (_more))
+
#define darray_room(_d) ((_d).size - (_d).nr)
#define darray_top(_d) ((_d).data[(_d).nr])
@@ -107,8 +113,11 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
+#define darray_for_each_from(_d, _i, _start) \
+ for (typeof(&(_d).data[0]) _i = _start; _i < (_d).data + (_d).nr; _i++)
+
#define darray_for_each(_d, _i) \
- for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+ darray_for_each_from(_d, _i, (_d).data)
#define darray_for_each_reverse(_d, _i) \
for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
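darray_for_each() is now a special case of the new darray_for_each_from(),
which resumes iteration from an arbitrary element pointer; the accounting
changes later in this patch use it to scan forward from i + 1. Illustrative
usage (error handling for darray_push() elided):

    DARRAY(int) nums = {};

    darray_push(&nums, 10);
    darray_push(&nums, 20);
    darray_push(&nums, 30);

    /* resume from the second element: visits 20, then 30 */
    darray_for_each_from(nums, i, &nums.data[1])
        pr_info("%i\n", *i);

    darray_exit(&nums);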
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 01838a3a189d..2c997fddefb3 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -225,7 +225,7 @@ static void trace_io_move_created_rebalance2(struct data_update *m,
trace_io_move_created_rebalance(c, buf.buf);
- this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
+ count_event(c, io_move_created_rebalance);
}
noinline_for_stack
@@ -693,6 +693,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
if (ret)
return ret;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ if (data_opts->kill_ec_ptrs & BIT(i))
+ bch2_bkey_drop_ec(n, p.ptr.dev);
+ i++;
+ }
+
while (data_opts->kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
@@ -803,10 +812,14 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
break;
}
- if (!nr_replicas) {
+ if (nr_replicas < m->op.nr_replicas) {
+ prt_printf(&buf, "\nnr_replicas %u < %u", nr_replicas, m->op.nr_replicas);
trace_data_update_done_no_rw_devs(c, buf.buf);
- return bch_err_throw(c, data_update_done_no_rw_devs);
}
+
+ if (!nr_replicas)
+ return bch_err_throw(c, data_update_done_no_rw_devs);
+
if (nr_replicas < m->op.nr_replicas)
return bch_err_throw(c, insufficient_devices);
return 0;
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 5e14d13568de..fc12aa65366f 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -12,6 +12,7 @@ struct moving_context;
struct data_update_opts {
unsigned rewrite_ptrs;
unsigned kill_ptrs;
+ unsigned kill_ec_ptrs;
u16 target;
u8 extra_replicas;
unsigned btree_insert_flags;
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index cb44b35e0f1d..fe6f3d874a47 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -95,7 +95,7 @@ static u64 bch2_dirent_hash(const struct bch_hash_info *info,
bch2_str_hash_update(&ctx, info, name->name, name->len);
/* [0,2) reserved for dots */
- return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
+ return max_t(u64, bch2_str_hash_end(&ctx, info, true), 2);
}
static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index f96530c70262..5863b5a30b61 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -11,6 +11,7 @@
#include "disk_accounting.h"
#include "error.h"
#include "journal_io.h"
+#include "recovery_passes.h"
#include "replicas.h"
/*
@@ -184,6 +185,9 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
void *end = &acc_k + 1;
int ret = 0;
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ return 0;
+
bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
bversion_zero(k.k->bversion),
c, accounting_key_version_0,
@@ -731,6 +735,37 @@ invalid_device:
goto fsck_err;
}
+static struct journal_key *accumulate_newer_accounting_keys(struct bch_fs *c, struct journal_key *i)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct bkey_i *k = journal_key_k(c, i);
+
+ darray_for_each_from(*keys, j, i + 1) {
+ if (journal_key_cmp(c, i, j))
+ return j;
+
+ struct bkey_i *n = journal_key_k(c, j);
+ if (n->k.type == KEY_TYPE_accounting) {
+ WARN_ON(bversion_cmp(k->k.bversion, n->k.bversion) >= 0);
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(k),
+ bkey_i_to_s_c_accounting(n));
+ j->overwritten = true;
+ }
+ }
+
+ return &darray_top(*keys);
+}
+
+static struct journal_key *accumulate_and_read_journal_accounting(struct btree_trans *trans, struct journal_key *i)
+{
+ struct bch_fs *c = trans->c;
+ struct journal_key *next = accumulate_newer_accounting_keys(c, i);
+
+ int ret = accounting_read_key(trans, bkey_i_to_s_c(journal_key_k(c, i)));
+ return ret ? ERR_PTR(ret) : next;
+}
+
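Accounting keys are deltas, so all newer journal keys for the same position
can be folded into the oldest one by element-wise addition, with the newer
keys marked overwritten. A toy model of that fold, using stand-in types
rather than the real bkey accessors:

    struct acct_sketch {
        long long   counters[4];
        unsigned    nr_counters;
    };

    /* models bch2_accounting_accumulate(): counters add element-wise */
    static void acct_accumulate(struct acct_sketch *dst,
                                const struct acct_sketch *src)
    {
        for (unsigned i = 0; i < dst->nr_counters; i++)
            dst->counters[i] += src->counters[i];
    }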
/*
* At startup time, initialize the in memory accounting from the btree (and
* journal)
@@ -756,80 +791,76 @@ int bch2_accounting_read(struct bch_fs *c)
percpu_memset(c->usage, 0, sizeof(*c->usage));
}
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *jk = keys->data;
+
+ while (jk < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+ jk++;
+
+ struct journal_key *end = jk;
+ while (end < &darray_top(*keys) &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+ end++;
+
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
iter.flags &= ~BTREE_ITER_with_journal;
int ret = for_each_btree_key_continue(trans, iter,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
- if (k.k->type != KEY_TYPE_accounting)
- continue;
-
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
- break;
-
- if (!bch2_accounting_is_mem(&acc_k)) {
- struct disk_accounting_pos next;
- memset(&next, 0, sizeof(next));
- next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
- continue;
- }
-
- accounting_read_key(trans, k);
- }));
- bch2_trans_iter_exit(&iter);
- if (ret)
- return ret;
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
- struct journal_keys *keys = &c->journal_keys;
- struct journal_key *dst = keys->data;
- move_gap(keys, keys->nr);
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
- darray_for_each(*keys, i) {
- if (i->k->k.type == KEY_TYPE_accounting) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
- if (!bch2_accounting_is_mem(&acc_k))
- continue;
+ while (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+ bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+ jk->overwritten = true;
+ jk++;
+ }
- struct bkey_s_c k = bkey_i_to_s_c(i->k);
- unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
- sizeof(acc->k.data[0]),
- accounting_pos_cmp, &k.k->p);
+ if (jk < end &&
+ __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
- bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
- if (applied)
- continue;
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ break;
- if (i + 1 < &darray_top(*keys) &&
- i[1].k->k.type == KEY_TYPE_accounting &&
- !journal_key_cmp(i, i + 1)) {
- WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
+ if (!bch2_accounting_is_mem(&acc_k)) {
+ struct disk_accounting_pos next_acc;
+ memset(&next_acc, 0, sizeof(next_acc));
+ next_acc.type = acc_k.type + 1;
+ struct bpos next = disk_accounting_pos_to_bpos(&next_acc);
+ if (jk < end)
+ next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+ bch2_btree_iter_set_pos(&iter, next);
+ continue;
+ }
- i[1].journal_seq = i[0].journal_seq;
+ accounting_read_key(trans, k);
+ }));
+ bch2_trans_iter_exit(&iter);
+ if (ret)
+ return ret;
- bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
- bkey_s_c_to_accounting(k));
- continue;
- }
+ while (jk < end)
+ jk = accumulate_and_read_journal_accounting(trans, jk);
- ret = accounting_read_key(trans, k);
- if (ret)
- return ret;
- }
-
- *dst++ = *i;
- }
+ struct journal_key *dst = keys->data;
+ darray_for_each(*keys, i)
+ if (!i->overwritten)
+ *dst++ = *i;
keys->gap = keys->nr = dst - keys->data;
guard(percpu_write)(&c->mark_lock);
@@ -880,6 +911,40 @@ int bch2_accounting_read(struct bch_fs *c)
u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+ /*
+	 * Check for underflow, and schedule check_allocations if
+	 * necessary:
+	 *
+	 * XXX - see if we can factor this out to run on a bkey so we
+	 * can check everything lazily; right now we don't check the
+	 * non in-mem counters at all
+ */
+ bool underflow = false;
+ for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
+ underflow |= (s64) v[j] < 0;
+
+ if (underflow) {
+ CLASS(printbuf, buf)();
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "Accounting underflow for\n");
+ bch2_accounting_key_to_text(&buf, &k);
+
+ for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
+ prt_printf(&buf, " %lli", v[j]);
+
+ bool print = bch2_count_fsck_err(c, accounting_key_underflow, &buf);
+ unsigned pos = buf.pos;
+ ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+ print |= buf.pos != pos;
+
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ if (ret)
+ return ret;
+ }
+
switch (k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved:
usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index 43f4b21d0aab..cc73cce98a44 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -145,6 +145,16 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
acc->type != BCH_DISK_ACCOUNTING_inum;
}
+static inline bool bch2_bkey_is_accounting_mem(struct bkey *k)
+{
+ if (k->type != KEY_TYPE_accounting)
+ return false;
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k->p);
+ return bch2_accounting_is_mem(&acc_k);
+}
+
/*
* Update in memory counters so they match the btree update we're doing; called
* from transaction commit path
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 32a286b3a74e..e33f3166c48a 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -141,14 +141,16 @@ void bch2_io_error_work(struct work_struct *work)
if (ca->mi.state >= BCH_MEMBER_STATE_ro)
return;
- bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
CLASS(printbuf, buf)();
__bch2_log_msg_start(ca->name, &buf);
- prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
- c->opts.write_error_timeout,
- dev ? "device" : "filesystem");
+ prt_printf(&buf, "writes erroring for %u seconds\n",
+ c->opts.write_error_timeout);
+
+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED, &buf);
+
+ prt_printf(&buf, "setting %s ro", dev ? "device" : "filesystem");
if (!dev)
bch2_fs_emergency_read_only2(c, &buf);
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 0c3c3a24fc6f..213814787dd6 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -173,7 +173,8 @@ do { \
if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
!bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \
ret = _ret; \
- ret = bch_err_throw(c, fsck_delete_bkey); \
+ else \
+ ret = bch_err_throw(c, fsck_delete_bkey); \
goto fsck_err; \
} while (0)
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index c4b0ea1adaa8..73eb28090bc7 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -98,11 +98,13 @@ static int count_iters_for_insert(struct btree_trans *trans,
return ret2 ?: ret;
}
-int bch2_extent_atomic_end(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos *end)
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
{
- unsigned nr_iters = 0;
+ enum bch_bkey_type whiteout_type =
+ extent_whiteout_type(trans->c, iter->btree_id, &insert->k);
+ struct bpos end = insert->k.p;
struct btree_iter copy;
bch2_trans_copy_iter(&copy, iter);
@@ -111,42 +113,60 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
if (ret)
goto err;
+ copy.flags |= BTREE_ITER_nofilter_whiteouts;
+
+ /*
+ * We're doing our own whiteout filtering, but we still need to pass a
+ * max key to avoid popping an assert in bch2_snapshot_is_ancestor():
+ */
struct bkey_s_c k;
- for_each_btree_key_max_continue_norestart(copy, *end, 0, k, ret) {
+ unsigned nr_iters = 0;
+ for_each_btree_key_max_continue_norestart(copy,
+ POS(insert->k.p.inode, U64_MAX),
+ 0, k, ret) {
unsigned offset = 0;
if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
offset = iter->pos.offset - bkey_start_offset(k.k);
- ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
- if (ret)
- break;
+ if (bkey_extent_whiteout(k.k)) {
+ if (bpos_gt(k.k->p, insert->k.p)) {
+ if (k.k->type == KEY_TYPE_extent_whiteout)
+ break;
+ else
+ continue;
+ } else if (k.k->type != whiteout_type) {
+ nr_iters += 1;
+ if (nr_iters >= EXTENT_ITERS_MAX) {
+ end = bpos_min(end, k.k->p);
+ break;
+ }
+ }
+ } else {
+ if (bpos_ge(bkey_start_pos(k.k), end))
+ break;
+
+ ret = count_iters_for_insert(trans, k, offset, &end, &nr_iters);
+ if (ret)
+ break;
+ }
}
err:
bch2_trans_iter_exit(&copy);
- return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *k)
-{
- struct bpos end = k->k.p;
- int ret = bch2_extent_atomic_end(trans, iter, &end);
- if (ret)
+ if (ret < 0)
return ret;
/* tracepoint */
- if (bpos_lt(end, k->k.p)) {
+ if (bpos_lt(end, insert->k.p)) {
if (trace_extent_trim_atomic_enabled()) {
CLASS(printbuf, buf)();
bch2_bpos_to_text(&buf, end);
prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(insert));
trace_extent_trim_atomic(trans->c, buf.buf);
}
- bch2_cut_back(end, k);
+ bch2_cut_back(end, insert);
}
return 0;
}
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index 34467db53f45..2d956d971b11 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -4,8 +4,6 @@
#include "bcachefs.h"
-int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
- struct bpos *);
int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
struct bkey_i *);
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index b879a586b7f6..68a61f7bc737 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -157,7 +157,7 @@ static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p2,
u64 p2_latency)
{
- struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
+ struct bch_dev *ca2 = bch2_dev_rcu_noerror(c, p2.ptr.dev);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
if (unlikely(failed_delta))
@@ -419,7 +419,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return false;
/* Extents may not straddle buckets: */
- struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, lp.ptr.dev);
bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
if (!same_bucket)
@@ -815,14 +815,14 @@ static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->ptr.dev);
return ca ? __extent_ptr_durability(ca, p) : 0;
}
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
- struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->ptr.dev);
if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
return 0;
@@ -995,6 +995,22 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
+void bch2_bkey_drop_ec(struct bkey_i *k, unsigned dev)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_entry *entry, *ec = NULL;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+ ec = entry;
+ else if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_ptr &&
+ entry->ptr.dev == dev) {
+ bch2_bkey_extent_entry_drop(k, ec);
+ return;
+ }
+ }
+}
+
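Within a key's extent entry list a stripe_ptr entry precedes the device
pointer it applies to, which is why the loop above remembers the most recent
stripe_ptr and drops it once the matching device pointer is reached.
Schematically (a sketch of the entry ordering, not actual on-disk bytes):

    entries:  [crc] [stripe_ptr s0] [ptr dev=0] [stripe_ptr s1] [ptr dev=1]

    bch2_bkey_drop_ec(k, 1) removes [stripe_ptr s1], demoting dev 1's
    pointer to an ordinary, non-erasure-coded replica.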
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -1028,7 +1044,7 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
guard(rcu)();
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
- (ca = bch2_dev_rcu(c, ptr->dev)) &&
+ (ca = bch2_dev_rcu_noerror(c, ptr->dev)) &&
(!ptr->cached ||
!dev_ptr_stale_rcu(ca, ptr)))
return true;
@@ -1212,7 +1228,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
guard(rcu)();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
- (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
+ (!(ca = bch2_dev_rcu_noerror(c, ptr->dev)) ||
dev_ptr_stale_rcu(ca, ptr) > 0));
return bkey_deleted(k.k);
@@ -1757,3 +1773,4 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
return -val_u64s_delta;
}
+
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 35ee03cd5065..f6dcb17108cd 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -650,6 +650,7 @@ void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_ec(struct bkey_i *k, unsigned);
#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
do { \
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 0005569ecace..9532f1a73053 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -42,6 +42,14 @@ struct readpages_iter {
folios folios;
};
+static inline void readpages_iter_folio_revert(struct readahead_control *ractl,
+ struct folio *folio)
+{
+ bch2_folio_release(folio);
+ ractl->_nr_pages += folio_nr_pages(folio);
+ ractl->_index -= folio_nr_pages(folio);
+}
+
static int readpages_iter_init(struct readpages_iter *iter,
struct readahead_control *ractl)
{
@@ -52,9 +60,7 @@ static int readpages_iter_init(struct readpages_iter *iter,
while ((folio = __readahead_folio(ractl))) {
if (!bch2_folio_create(folio, GFP_KERNEL) ||
darray_push(&iter->folios, folio)) {
- bch2_folio_release(folio);
- ractl->_nr_pages += folio_nr_pages(folio);
- ractl->_index -= folio_nr_pages(folio);
+ readpages_iter_folio_revert(ractl, folio);
return iter->folios.nr ? 0 : -ENOMEM;
}
@@ -64,6 +70,15 @@ static int readpages_iter_init(struct readpages_iter *iter,
return 0;
}
+static void readpages_iter_exit(struct readpages_iter *iter,
+ struct readahead_control *ractl)
+{
+ darray_for_each_reverse(iter->folios, folio) {
+ readpages_iter_folio_revert(ractl, *folio);
+ folio_get(*folio);
+ }
+}
+
static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
{
if (iter->idx >= iter->folios.nr)
@@ -290,7 +305,10 @@ void bch2_readahead(struct readahead_control *ractl)
* scheduling.
*/
blk_start_plug(&plug);
- bch2_pagecache_add_get(inode);
+ if (!bch2_pagecache_add_tryget(inode)) {
+ readpages_iter_exit(&readpages_iter, ractl);
+ goto out;
+ }
struct btree_trans *trans = bch2_trans_get(c);
while ((folio = readpage_iter_peek(&readpages_iter))) {
@@ -317,6 +335,7 @@ void bch2_readahead(struct readahead_control *ractl)
bch2_trans_put(trans);
bch2_pagecache_add_put(inode);
+out:
blk_finish_plug(&plug);
darray_exit(&readpages_iter.folios);
}
@@ -648,6 +667,17 @@ do_io:
return 0;
}
+static int bch2_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, void *data)
+{
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = __bch2_writepage(folio, wbc, data);
+ return error;
+}
+
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
@@ -656,7 +686,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
blk_start_plug(&w->plug);
- int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
+ int ret = bch2_write_cache_pages(mapping, wbc, w);
if (w->io)
bch2_writepage_do_io(w);
blk_finish_plug(&w->plug);
@@ -759,7 +789,6 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
struct bch2_folio_reservation *res = fsdata;
unsigned offset = pos - folio_pos(folio);
- lockdep_assert_held(&inode->v.i_rwsem);
BUG_ON(offset + copied > folio_size(folio));
if (unlikely(copied < len && !folio_test_uptodate(folio))) {
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index b5e3090f1cb8..0425238a83ee 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -268,7 +268,7 @@ restart:
rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
if (inode->ei_inum.inum == inum) {
ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
- GFP_NOWAIT|__GFP_NOWARN);
+ GFP_NOWAIT);
if (ret) {
rcu_read_unlock();
ret = darray_make_room(&subvols, 1);
@@ -826,14 +826,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_inode_update_after_write(trans, inode, &inode_u,
ATTR_MTIME);
- if (inode_u.bi_subvol) {
- /*
- * Subvolume deletion is asynchronous, but we still want to tell
- * the VFS that it's been deleted here:
- */
- set_nlink(&inode->v, 0);
- }
-
if (IS_CASEFOLDED(vdir))
d_invalidate(dentry);
err:
@@ -865,9 +857,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
if (IS_ERR(inode))
return bch2_err_class(PTR_ERR(inode));
- inode_lock(&inode->v);
ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
- inode_unlock(&inode->v);
if (unlikely(ret))
goto err;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 6ccea09243ab..ccc44b1fc178 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -266,7 +266,8 @@ create_lostfound:
root_inode.bi_nlink++;
- ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
+ ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu,
+ inode_opt_get(c, &root_inode, inodes_32bit));
if (ret)
goto err;
@@ -573,7 +574,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
new_inode.bi_subvol = subvolid;
- int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+ int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu, false) ?:
bch2_btree_iter_traverse(&inode_iter) ?:
bch2_inode_write(trans, &inode_iter, &new_inode);
bch2_trans_iter_exit(&inode_iter);
@@ -1444,7 +1445,7 @@ static int check_key_has_inode(struct btree_trans *trans,
if (ret)
return ret;
- if (k.k->type == KEY_TYPE_whiteout)
+ if (bkey_extent_whiteout(k.k))
return 0;
bool have_inode = i && !i->whiteout;
@@ -1924,7 +1925,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
&inode->recalculate_sums);
if (ret)
goto err;
+ }
+ if (!bkey_extent_whiteout(k.k)) {
/*
* Check inodes in reverse order, from oldest snapshots to
* newest, starting from the inode that matches this extent's
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index d5e5190f0663..4aa130ff7cf6 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -944,11 +944,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
}
static struct bkey_i_inode_alloc_cursor *
-bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
+bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max,
+ bool is_32bit)
{
struct bch_fs *c = trans->c;
- u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
+ u64 cursor_idx = is_32bit ? 0 : cpu + 1;
cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
@@ -967,7 +968,7 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m
if (IS_ERR(cursor))
return cursor;
- if (c->opts.inodes_32bit) {
+ if (is_32bit) {
*min = BLOCKDEV_INODE_MAX;
*max = INT_MAX;
} else {
@@ -996,11 +997,11 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m
int bch2_inode_create(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
+ u32 snapshot, u64 cpu, bool is_32bit)
{
u64 min, max;
struct bkey_i_inode_alloc_cursor *cursor =
- bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
+ bch2_inode_alloc_cursor_get(trans, cpu, &min, &max, is_32bit);
int ret = PTR_ERR_OR_ZERO(cursor);
if (ret)
return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index b8ec3e628d90..79092ea74844 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -172,7 +172,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
struct bch_inode_unpacked *);
int bch2_inode_create(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *, u32, u64);
+ struct bch_inode_unpacked *, u32, u64, bool);
int bch2_inode_rm(struct bch_fs *, subvol_inum);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 1f00938b1bdc..e07fa6cc99bd 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -144,7 +144,8 @@ enum inode_opt_id {
x(unlinked, 7) \
x(backptr_untrusted, 8) \
x(has_child_snapshot, 9) \
- x(has_case_insensitive, 10)
+ x(has_case_insensitive, 10) \
+ x(31bit_dirent_offset, 11)
/* bits 20+ reserved for packed fields below: */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 07869436a964..93ac0faedf7d 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -120,6 +120,7 @@ static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
+ p->bytes = 0;
}
/*
@@ -264,6 +265,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ struct journal_entry_pin_list *pin_list =
+ journal_seq_pin(j, journal_cur_seq(j));
+ pin_list->bytes = roundup_pow_of_two(vstruct_bytes(buf->data));
+ j->dirty_entry_bytes += pin_list->bytes;
+
if (trace_journal_entry_close_enabled() && trace) {
CLASS(printbuf, err)();
guard(printbuf_atomic)(&err);
@@ -737,9 +743,9 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
CLASS(printbuf, buf)();
+ prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
bch2_journal_debug_to_text(&buf, j);
bch2_print_str(c, KERN_ERR, buf.buf);
- prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
closure_wait_event(&j->async_wait,
!bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 093e4acad085..0a9fbc76f363 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -3,6 +3,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
+#include "btree_journal_iter.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
@@ -106,11 +107,6 @@ static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *cs
return !bch2_crc_cmp(j->csum, *csum);
}
-static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
-{
- return (seq - c->journal_entries_base_seq) & (~0U >> 1);
-}
-
static void __journal_replay_free(struct bch_fs *c,
struct journal_replay *i)
{
@@ -156,6 +152,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct journal_replay **_i, *i, *dup;
size_t bytes = vstruct_bytes(j);
u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ u64 seq = le64_to_cpu(j->seq);
CLASS(printbuf, buf)();
int ret = JOURNAL_ENTRY_ADD_OK;
@@ -163,12 +160,11 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
last_seq = min(last_seq, c->opts.journal_rewind);
if (!c->journal.oldest_seq_found_ondisk ||
- le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
- c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
+ seq < c->journal.oldest_seq_found_ondisk)
+ c->journal.oldest_seq_found_ondisk = seq;
/* Is this entry older than the range we need? */
- if (!c->opts.read_entire_journal &&
- le64_to_cpu(j->seq) < jlist->last_seq)
+ if (!c->opts.read_entire_journal && seq < jlist->last_seq)
return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
/*
@@ -177,7 +173,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 * within the range of +-2 billion of the first one we find.
*/
if (!c->journal_entries_base_seq)
- c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
+ c->journal_entries_base_seq = max_t(s64, 1, seq - S32_MAX);
/* Drop entries we don't need anymore */
if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
@@ -195,11 +191,36 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
}
}
+	/* Drop overwrite and log entries if we don't need them: */
+ if (!c->opts.retain_recovery_info &&
+ !c->opts.journal_rewind) {
+ vstruct_for_each_safe(j, src)
+ if (vstruct_end(src) > vstruct_end(j))
+ goto nocompact;
+
+ struct jset_entry *dst = j->start;
+ vstruct_for_each_safe(j, src) {
+ if (src->type == BCH_JSET_ENTRY_log ||
+ src->type == BCH_JSET_ENTRY_overwrite)
+ continue;
+
+ memmove_u64s_down(dst, src, vstruct_u64s(src));
+ dst = vstruct_next(dst);
+ }
+
+ j->u64s = cpu_to_le32((u64 *) dst - j->_data);
+ bytes = vstruct_bytes(j);
+ }
+nocompact:
jlist->last_seq = max(jlist->last_seq, last_seq);
- _i = genradix_ptr_alloc(&c->journal_entries,
- journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
- GFP_KERNEL);
+ if (seq < c->journal_entries_base_seq ||
+ seq >= c->journal_entries_base_seq + U32_MAX) {
+		bch_err(c, "journal entry sequence numbers span too large a range: cannot replay, contact developers");
+ return bch_err_throw(c, ENOMEM_journal_entry_add);
+ }
+
+ _i = genradix_ptr_alloc(&c->journal_entries, journal_entry_radix_idx(c, seq), GFP_KERNEL);
if (!_i)
return bch_err_throw(c, ENOMEM_journal_entry_add);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index f53c5c81d137..f8754bf71264 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -7,29 +7,6 @@
void bch2_journal_pos_from_member_info_set(struct bch_fs *);
void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
-struct journal_ptr {
- bool csum_good;
- struct bch_csum csum;
- u8 dev;
- u32 bucket;
- u32 bucket_offset;
- u64 sector;
-};
-
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
- DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
-
- bool csum_good;
- bool ignore_blacklisted;
- bool ignore_not_dirty;
- /* must be last: */
- struct jset j;
-};
-
static inline bool journal_replay_ignore(struct journal_replay *i)
{
return !i || i->ignore_blacklisted || i->ignore_not_dirty;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index f23e5ee9ad75..bd1885607d3e 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -148,6 +148,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+ size_t mem_limit = max_t(ssize_t, 0,
+ (totalram_pages() * PAGE_SIZE) / 4 - j->dirty_entry_bytes);
+
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr ||
!ca->mi.durability)
@@ -180,6 +183,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
* @nr_devs_want largest devices:
*/
space = dev_space[nr_devs_want - 1];
+ space.total = min(space.total, mem_limit >> 9);
space.next_entry = min(space.next_entry, min_bucket_size);
return space;
}
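A worked example of the new memory cap (illustrative numbers): on a 16 GiB
machine the budget is a quarter of RAM, and the final >> 9 converts bytes to
512-byte sectors before clamping space.total:

    totalram * PAGE_SIZE    = 16 GiB
    budget (RAM / 4)        = 4 GiB
    j->dirty_entry_bytes    = 1 GiB
    mem_limit               = 4 GiB - 1 GiB = 3 GiB = 3221225472 bytes
    space.total cap         = 3221225472 >> 9 = 6291456 sectors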
@@ -328,9 +332,17 @@ void bch2_journal_reclaim_fast(struct journal *j)
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
+ struct journal_entry_pin_list *pin_list;
while (!fifo_empty(&j->pin) &&
j->pin.front <= j->seq_ondisk &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ !atomic_read(&(pin_list = &fifo_peek_front(&j->pin))->count)) {
+
+ if (WARN_ON(j->dirty_entry_bytes < pin_list->bytes))
+ pin_list->bytes = j->dirty_entry_bytes;
+
+ j->dirty_entry_bytes -= pin_list->bytes;
+ pin_list->bytes = 0;
+
j->pin.front++;
popped = true;
}
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 51104bbb99da..7c9273bd0e15 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -71,6 +71,7 @@ struct journal_entry_pin_list {
struct list_head flushed[JOURNAL_PIN_TYPE_NR];
atomic_t count;
struct bch_devs_list devs;
+ size_t bytes;
};
struct journal;
@@ -253,6 +254,7 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
+ size_t dirty_entry_bytes;
struct journal_space space[journal_space_nr];
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 8abd0aa2083a..6f1e0a7b5db5 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -24,6 +24,16 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
return pos;
}
+static inline struct bpos lru_start(u16 lru_id)
+{
+ return lru_pos(lru_id, 0, 0);
+}
+
+static inline struct bpos lru_end(u16 lru_id)
+{
+ return lru_pos(lru_id, U64_MAX, LRU_TIME_MAX);
+}
+
static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index a66d01d04e57..892990b4a6a6 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -125,6 +125,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
if (!btree_type_has_ptrs(id))
continue;
+ /* Stripe keys have pointers, but are handled separately */
+ if (id == BTREE_ID_stripes)
+ continue;
+
int ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index a38996f5366f..4f41f1f6ec6c 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -150,7 +150,7 @@ static void move_write_done(struct bch_write_op *op)
bch2_write_op_to_text(&buf, op);
trace_io_move_write_fail(c, buf.buf);
}
- this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
+ count_event(c, io_move_write_fail);
ctxt->write_error = true;
}
@@ -344,9 +344,13 @@ int bch2_move_extent(struct moving_context *ctxt,
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas &&
!data_opts.scrub) {
- if (data_opts.kill_ptrs)
+ if (data_opts.kill_ptrs|data_opts.kill_ec_ptrs) {
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size);
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
- return 0;
+ } else {
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_noop], k.k->size);
+ return 0;
+ }
}
struct moving_io *io = allocate_dropping_locks(trans, ret,
@@ -538,7 +542,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
if (ctxt->wait_on_copygc && c->copygc_running) {
bch2_moving_ctxt_flush_all(ctxt);
- wait_event_killable(c->copygc_running_wq,
+ wait_event_freezable(c->copygc_running_wq,
!c->copygc_running ||
(is_kthread && kthread_should_stop()));
}
@@ -815,7 +819,9 @@ static int bch2_move_data(struct bch_fs *c,
unsigned min_depth_this_btree = min_depth;
- if (!btree_type_has_ptrs(id))
+ /* Stripe keys have pointers, but are handled separately */
+ if (!btree_type_has_ptrs(id) ||
+ id == BTREE_ID_stripes)
min_depth_this_btree = max(min_depth_this_btree, 1);
for (unsigned level = min_depth_this_btree;
@@ -1276,7 +1282,17 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
- return data_opts->kill_ptrs != 0;
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ if (p.has_ec && durability - p.ec.redundancy >= replicas) {
+ data_opts->kill_ec_ptrs |= BIT(i);
+ durability -= p.ec.redundancy;
+ }
+
+ i++;
+ }
+
+ return (data_opts->kill_ptrs|data_opts->kill_ec_ptrs) != 0;
}
static bool scrub_pred(struct bch_fs *c, void *_arg,
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index b0cbe3c1aab6..f36d60b8fb07 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -14,6 +14,7 @@
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
+#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "lru.h"
@@ -131,72 +132,153 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
}
+static int try_add_copygc_bucket(struct btree_trans *trans,
+ struct buckets_in_flight *buckets_in_flight,
+ struct bpos bucket, u64 lru_time)
+{
+ struct move_bucket b = { .k.bucket = bucket };
+
+ int ret = bch2_bucket_is_movable(trans, &b, lru_time);
+ if (ret <= 0)
+ return ret;
+
+ if (bucket_in_flight(buckets_in_flight, b.k))
+ return 0;
+
+ struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
+ if (!b_i)
+ return -ENOMEM;
+
+ *b_i = b;
+
+ ret = darray_push(&buckets_in_flight->to_evacuate, b_i);
+ if (ret) {
+ kfree(b_i);
+ return ret;
+ }
+
+ ret = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+
+ size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+ return buckets_in_flight->to_evacuate.nr >= nr_to_get;
+}
+
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight)
{
struct btree_trans *trans = ctxt->trans;
- struct bch_fs *c = trans->c;
- size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
- size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
- int ret;
- move_buckets_wait(ctxt, buckets_in_flight, false);
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+ lru_start(BCH_LRU_BUCKET_FRAGMENTATION),
+ lru_end(BCH_LRU_BUCKET_FRAGMENTATION),
+ 0, k,
+ try_add_copygc_bucket(trans, buckets_in_flight,
+ u64_to_bucket(k.k->p.offset),
+ lru_pos_time(k.k->p))
+ );
- ret = bch2_btree_write_buffer_tryflush(trans);
- if (bch2_err_matches(ret, EROFS))
- return ret;
+ return ret < 0 ? ret : 0;
+}
- if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
- return ret;
+static int bch2_copygc_get_stripe_buckets(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight)
+{
+ struct btree_trans *trans = ctxt->trans;
- ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
- lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
- 0, k, ({
- struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
- int ret2 = 0;
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+ lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+ lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+ 0, lru_k, ({
+ CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+ struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+ int ret2 = bkey_err(s_k);
+ if (ret2)
+ goto err;
- saw++;
+ if (s_k.k->type != KEY_TYPE_stripe)
+ continue;
- ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
- if (ret2 < 0)
- goto err;
+ const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
- if (!ret2)
- not_movable++;
- else if (bucket_in_flight(buckets_in_flight, b.k))
- in_flight++;
- else {
- struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
- ret2 = b_i ? 0 : -ENOMEM;
+ /* write buffer race? */
+ if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+ continue;
+
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ for (unsigned i = 0; i < nr_data; i++) {
+ if (!stripe_blockcount_get(s, i))
+ continue;
+
+ const struct bch_extent_ptr *ptr = s->ptrs + i;
+ CLASS(bch2_dev_tryget, ca)(trans->c, ptr->dev);
+ if (unlikely(!ca))
+ continue;
+
+ ret2 = try_add_copygc_bucket(trans, buckets_in_flight,
+ PTR_BUCKET_POS(ca, ptr), U64_MAX);
if (ret2)
- goto err;
+ break;
+ }
+err:
+ ret2;
+ }));
- *b_i = b;
+ return ret < 0 ? ret : 0;
+}
+
+static bool should_do_ec_copygc(struct btree_trans *trans)
+{
+ u64 stripe_frag_ratio = 0;
+
+ for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+ lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+ lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+ 0, lru_k, ({
+ CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+ struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+ int ret = bkey_err(s_k);
+ if (ret)
+ goto err;
- ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
- if (ret2) {
- kfree(b_i);
- goto err;
- }
+ if (s_k.k->type != KEY_TYPE_stripe)
+ continue;
- ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
- bch_move_bucket_params);
- BUG_ON(ret2);
+ const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
- sectors += b.sectors;
- }
+ /* write buffer race? */
+ if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+ continue;
- ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_nonempty = 0;
+ for (unsigned i = 0; i < nr_data; i++)
+ blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+ /* stripe is pending delete */
+ if (!blocks_nonempty)
+ continue;
+
+ /*
+ * This matches the calculation in alloc_lru_idx_fragmentation, so we
+ * can directly compare without actually looking up the bucket pointed
+ * to by the bucket fragmentation lru:
+ */
+ stripe_frag_ratio = div_u64(blocks_nonempty * (1ULL << 31), nr_data);
+ break;
err:
- ret2;
+ ret;
}));
- pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
- buckets_in_flight->nr, buckets_in_flight->sectors,
- saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
+ CLASS(btree_iter, iter)(trans, BTREE_ID_lru, lru_start(BCH_LRU_BUCKET_FRAGMENTATION), 0);
+ struct bkey_s_c lru_k;
- return ret < 0 ? ret : 0;
+ lockrestart_do(trans, bkey_err(lru_k = bch2_btree_iter_peek_max(&iter,
+ lru_end(BCH_LRU_BUCKET_FRAGMENTATION))));
+
+ u64 bucket_frag_ratio = lru_k.k && !bkey_err(lru_k) ? lru_pos_time(lru_k.k->p) : 0;
+
+ /* Prefer normal bucket copygc */
+ return stripe_frag_ratio && stripe_frag_ratio * 2 < bucket_frag_ratio;
}
noinline
@@ -213,7 +295,18 @@ static int bch2_copygc(struct moving_context *ctxt,
u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
+ move_buckets_wait(ctxt, buckets_in_flight, false);
+
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (bch2_err_matches(ret, EROFS))
+ goto err;
+
+ if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
+ goto err;
+
+ ret = should_do_ec_copygc(trans)
+ ? bch2_copygc_get_stripe_buckets(ctxt, buckets_in_flight)
+ : bch2_copygc_get_buckets(ctxt, buckets_in_flight);
if (ret)
goto err;
@@ -265,7 +358,8 @@ static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (data_type_movable(i))
- fragmented += usage_full.d[i].fragmented;
+ fragmented += usage_full.d[i].buckets * ca->mi.bucket_size -
+ usage_full.d[i].sectors;
return max(0LL, fragmented_allowed - fragmented);
}
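A standalone illustration of the new copygc policy: should_do_ec_copygc() expresses stripe fullness as blocks_nonempty / nr_data in 2^31 fixed point (the same scale alloc_lru_idx_fragmentation uses for the bucket fragmentation LRU), then only picks stripe copygc when stripes look at least twice as fragmented as the emptiest bucket. This is a sketch, not kernel code:

#include <stdbool.h>
#include <stdint.h>

/* blocks_nonempty / nr_data in 2^31 fixed point; smaller means emptier,
 * i.e. a better copygc candidate */
static uint64_t stripe_frag_ratio(unsigned blocks_nonempty, unsigned nr_data)
{
	return ((uint64_t) blocks_nonempty << 31) / nr_data;
}

/* Prefer normal bucket copygc unless stripes are markedly emptier */
static bool prefer_ec_copygc(uint64_t stripe_ratio, uint64_t bucket_ratio)
{
	return stripe_ratio && stripe_ratio * 2 < bucket_ratio;
}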
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index d1019052f182..5c321a0d1f89 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -62,7 +62,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (flags & BCH_CREATE_TMPFILE)
new_inode->bi_flags |= BCH_INODE_unlinked;
- ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu,
+ inode_opt_get(c, dir_u, inodes_32bit));
if (ret)
goto err;
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 921f9049912d..c3ef35dc01e2 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -525,7 +525,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id
switch (id) {
case Opt_state:
if (ca)
- return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
+ return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED, NULL);
break;
case Opt_compression:
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 84ce69a7f131..31a3abcbd83e 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -242,7 +242,7 @@ enum fsck_err_opts {
x(inodes_32bit, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- BCH_SB_INODE_32BIT, true, \
+ BCH_SB_INODE_32BIT, false, \
NULL, "Constrain inode numbers to 32 bits") \
x(shard_inode_numbers_bits, u8, \
OPT_FS|OPT_FORMAT, \
@@ -321,6 +321,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
+ x(no_version_check, u8, \
+ OPT_HIDDEN, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't fail reading the superblock due to incompatible version")\
x(verbose, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index c0c5fe961a83..25bf72dc6488 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -292,12 +292,48 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
: 0;
}
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
- struct btree_iter *work_iter)
+#define REBALANCE_WORK_BUF_NR 1024
+DEFINE_DARRAY_NAMED(darray_rebalance_work, struct bkey_i_cookie);
+
+static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
+ darray_rebalance_work *buf, struct bpos *work_pos)
{
- return !kthread_should_stop()
- ? bch2_btree_iter_peek(work_iter)
- : bkey_s_c_null;
+ if (unlikely(!buf->nr)) {
+ /*
+ * Avoid contention with write buffer flush: buffer up rebalance
+ * work entries in a darray
+ */
+
+ BUG_ON(!buf->size);
+
+ bch2_trans_begin(trans);
+
+ for_each_btree_key(trans, iter, BTREE_ID_rebalance_work, *work_pos,
+ BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
+ /* we previously used darray_make_room */
+ BUG_ON(bkey_bytes(k.k) > sizeof(buf->data[0]));
+
+ bkey_reassemble(&darray_top(*buf).k_i, k);
+ buf->nr++;
+
+ *work_pos = bpos_successor(iter.pos);
+ if (buf->nr == buf->size)
+ break;
+ 0;
+ }));
+
+ if (!buf->nr)
+ return NULL;
+
+ unsigned l = 0, r = buf->nr - 1;
+ while (l < r) {
+ swap(buf->data[l], buf->data[r]);
+ l++;
+ r--;
+ }
+ }
+
+ return &(&darray_pop(buf))->k_i;
}
static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
@@ -385,6 +421,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
trace_rebalance_extent(c, buf.buf);
}
+ count_event(c, rebalance_extent);
return k;
}
@@ -408,8 +445,9 @@ static int do_rebalance_extent(struct moving_context *ctxt,
bch2_bkey_buf_init(&sk);
- ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &io_opts, &data_opts));
+ ret = lockrestart_do(trans,
+ bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &io_opts, &data_opts)));
if (ret || !k.k)
goto out;
@@ -441,11 +479,12 @@ out:
return ret;
}
-static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
+static int do_rebalance_scan(struct moving_context *ctxt,
+ u64 inum, u64 cookie, u64 *sectors_scanned)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct bch_fs_rebalance *r = &c->rebalance;
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
@@ -464,10 +503,9 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
per_snapshot_io_opts_init(&snapshot_io_opts, c);
int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
- r->scan_start.pos, r->scan_end.pos,
- BTREE_ITER_all_snapshots|
- BTREE_ITER_not_extents|
- BTREE_ITER_prefetch, k, ({
+ r->scan_start.pos, r->scan_end.pos,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_prefetch, k, ({
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
@@ -478,14 +516,16 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
per_snapshot_io_opts_exit(&snapshot_io_opts);
- bch2_move_stats_exit(&r->scan_stats, trans->c);
+ *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
+ bch2_move_stats_exit(&r->scan_stats, c);
/*
* Ensure that the rebalance_work entries we created are seen by the
* next iteration of do_rebalance(), so we don't end up stuck in
* rebalance_wait():
*/
- atomic64_inc(&r->scan_stats.sectors_seen);
+ *sectors_scanned += 1;
+
bch2_btree_write_buffer_flush_sync(trans);
return ret;
@@ -524,58 +564,47 @@ static int do_rebalance(struct moving_context *ctxt)
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter extent_iter = { NULL };
- struct bkey_s_c k;
+ struct btree_iter extent_iter = {};
+ u64 sectors_scanned = 0;
u32 kick = r->kick;
- int ret = 0;
- bch2_trans_begin(trans);
+ struct bpos work_pos = POS_MIN;
+ CLASS(darray_rebalance_work, work)();
+ int ret = darray_make_room(&work, REBALANCE_WORK_BUF_NR);
+ if (ret)
+ return ret;
bch2_move_stats_init(&r->work_stats, "rebalance_work");
- bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
-
- CLASS(btree_iter, rebalance_work_iter)(trans,
- BTREE_ID_rebalance_work, POS_MIN,
- BTREE_ITER_all_snapshots);
while (!bch2_move_ratelimit(ctxt)) {
if (!bch2_rebalance_enabled(c)) {
bch2_moving_ctxt_flush_all(ctxt);
kthread_wait_freezable(bch2_rebalance_enabled(c) ||
kthread_should_stop());
+ if (kthread_should_stop())
+ break;
}
- if (kthread_should_stop())
- break;
-
- bch2_trans_begin(trans);
-
- ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret || !k.k)
+ struct bkey_i *k = next_rebalance_entry(trans, &work, &work_pos);
+ if (!k)
break;
- ret = k.k->type == KEY_TYPE_cookie
- ? do_rebalance_scan(ctxt, k.k->p.inode,
- le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
- : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
+ ret = k->k.type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k->k.p.inode,
+ le64_to_cpu(bkey_i_to_cookie(k)->v.cookie),
+ &sectors_scanned)
+ : do_rebalance_extent(ctxt, k->k.p, &extent_iter);
if (ret)
break;
-
- bch2_btree_iter_advance(&rebalance_work_iter);
}
bch2_trans_iter_exit(&extent_iter);
- bch2_move_stats_exit(&r->scan_stats, c);
+ bch2_move_stats_exit(&r->work_stats, c);
if (!ret &&
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
- !atomic64_read(&r->scan_stats.sectors_seen) &&
+ !sectors_scanned &&
kick == r->kick) {
bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
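The refill-reverse-pop pattern in next_rebalance_entry() above, reduced to a standalone sketch with plain ints standing in for struct bkey_i_cookie: a batch is read in ascending key order, reversed once, then consumed from the tail, so each pop is O(1) while still yielding keys in btree order, and the rebalance_work btree is only walked once per batch:

#include <stddef.h>

static void reverse_batch(int *buf, size_t nr)
{
	if (nr < 2)
		return;

	for (size_t l = 0, r = nr - 1; l < r; l++, r--) {
		int tmp = buf[l];
		buf[l] = buf[r];
		buf[r] = tmp;
	}
}

/* consume oldest-first: while (nr) item = buf[--nr]; */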
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index c57ff235a97a..6319144a440c 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -64,7 +64,6 @@ int bch2_btree_lost_data(struct bch_fs *c,
* but in debug mode we want the next fsck run to be clean:
*/
ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret;
- ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret;
#endif
write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
@@ -182,9 +181,12 @@ void bch2_reconstruct_alloc(struct bch_fs *c)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
- darray_for_each(*keys, i)
- if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
- bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+ struct bch_fs *c = container_of(keys, struct bch_fs, journal_keys);
+ darray_for_each(*keys, i) {
+ struct bkey_i *k = journal_key_k(c, i);
+ if (k->k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(k)->v.mem_ptr = 0;
+ }
}
/* journal replay: */
@@ -202,8 +204,10 @@ static void replay_now_at(struct journal *j, u64 seq)
static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct journal_key *k)
{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *bk = journal_key_k(c, k);
struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
int ret = bch2_btree_iter_traverse(&iter);
@@ -214,14 +218,14 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
/* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
+ if (bversion_cmp(old.k->bversion, bk->k.bversion) >= 0) {
ret = 0;
goto out;
}
- struct bkey_i *new = k->k;
+ struct bkey_i *new = bk;
if (old.k->type == KEY_TYPE_accounting) {
- new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
+ new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(bk));
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto out;
@@ -230,7 +234,8 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
bkey_s_c_to_accounting(old));
}
- trans->journal_res.seq = k->journal_seq;
+ if (!k->allocated)
+ trans->journal_res.seq = c->journal_entries_base_seq + k->journal_seq_offset;
ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
out:
@@ -241,6 +246,7 @@ out:
static int bch2_journal_replay_key(struct btree_trans *trans,
struct journal_key *k)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
unsigned iter_flags =
BTREE_ITER_intent|
@@ -251,7 +257,8 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
return 0;
- trans->journal_res.seq = k->journal_seq;
+ if (!k->allocated)
+ trans->journal_res.seq = c->journal_entries_base_seq + k->journal_seq_offset;
/*
* BTREE_UPDATE_key_cache_reclaim disables key cache lookup/update to
@@ -266,7 +273,8 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
else
update_flags |= BTREE_UPDATE_key_cache_reclaim;
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ struct bkey_i *bk = journal_key_k(c, k);
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
ret = bch2_btree_iter_traverse(&iter);
@@ -275,13 +283,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
struct btree_path *path = btree_iter_path(trans, &iter);
if (unlikely(!btree_path_node(path, k->level))) {
- struct bch_fs *c = trans->c;
-
CLASS(printbuf, buf)();
prt_str(&buf, "btree=");
bch2_btree_id_to_text(&buf, k->btree_id);
prt_printf(&buf, " level=%u ", k->level);
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(bk));
if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
@@ -298,7 +304,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
}
bch2_trans_iter_exit(&iter);
- bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, bk->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
@@ -310,17 +316,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
goto out;
- if (k->k->k.type == KEY_TYPE_accounting) {
- struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
+ if (bk->k.type == KEY_TYPE_accounting) {
+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, bk->k.u64s);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto out;
- bkey_copy(n, k->k);
+ bkey_copy(n, bk);
goto out;
}
- ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+ ret = bch2_trans_update(trans, &iter, bk, update_flags);
out:
bch2_trans_iter_exit(&iter);
return ret;
@@ -331,13 +337,9 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r);
- /*
- * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last
- *
- * journal_seq == 0 means that the key comes from early repair, and
- * should be inserted last so as to avoid overflowing the journal
- */
- return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
+ return !l->allocated && !r->allocated
+ ? cmp_int(l->journal_seq_offset, r->journal_seq_offset)
+ : cmp_int(l->allocated, r->allocated);
}
DEFINE_DARRAY_NAMED(darray_journal_keys, struct journal_key *)
@@ -369,7 +371,9 @@ int bch2_journal_replay(struct bch_fs *c)
* flush accounting keys until we're done
*/
darray_for_each(*keys, k) {
- if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
+ struct bkey_i *bk = journal_key_k(trans->c, k);
+
+ if (!(bk->k.type == KEY_TYPE_accounting && !k->allocated))
continue;
cond_resched();
@@ -412,7 +416,6 @@ int bch2_journal_replay(struct bch_fs *c)
BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
@@ -434,8 +437,8 @@ int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = *kp;
- if (k->journal_seq)
- replay_now_at(j, k->journal_seq);
+ if (!k->allocated)
+ replay_now_at(j, c->journal_entries_base_seq + k->journal_seq_offset);
else
replay_now_at(j, j->replay_journal_seq_end);
@@ -607,7 +610,7 @@ static int read_btree_roots(struct bch_fs *c)
c, btree_root_read_error,
"error reading btree root %s: %s",
buf.buf, bch2_err_str(ret))) {
- if (btree_id_is_alloc(i))
+ if (btree_id_can_reconstruct(i))
r->error = 0;
ret = 0;
}
@@ -626,93 +629,6 @@ fsck_err:
return ret;
}
-static bool check_version_upgrade(struct bch_fs *c)
-{
- unsigned latest_version = bcachefs_metadata_version_current;
- unsigned latest_compatible = min(latest_version,
- bch2_latest_compatible_version(c->sb.version));
- unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
- unsigned new_version = 0;
- bool ret = false;
-
- if (old_version < bcachefs_metadata_required_upgrade_below) {
- if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
- latest_compatible < bcachefs_metadata_required_upgrade_below)
- new_version = latest_version;
- else
- new_version = latest_compatible;
- } else {
- switch (c->opts.version_upgrade) {
- case BCH_VERSION_UPGRADE_compatible:
- new_version = latest_compatible;
- break;
- case BCH_VERSION_UPGRADE_incompatible:
- new_version = latest_version;
- break;
- case BCH_VERSION_UPGRADE_none:
- new_version = min(old_version, latest_version);
- break;
- }
- }
-
- if (new_version > old_version) {
- CLASS(printbuf, buf)();
-
- if (old_version < bcachefs_metadata_required_upgrade_below)
- prt_str(&buf, "Version upgrade required:\n");
-
- if (old_version != c->sb.version) {
- prt_str(&buf, "Version upgrade from ");
- bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, c->sb.version);
- prt_str(&buf, " incomplete\n");
- }
-
- prt_printf(&buf, "Doing %s version upgrade from ",
- BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
- ? "incompatible" : "compatible");
- bch2_version_to_text(&buf, old_version);
- prt_str(&buf, " to ");
- bch2_version_to_text(&buf, new_version);
- prt_newline(&buf);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_upgrade(c, old_version, new_version);
- passes = ext->recovery_passes_required[0] & ~passes;
-
- if (passes) {
- prt_str(&buf, " running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_notice(c, "%s", buf.buf);
- ret = true;
- }
-
- if (new_version > c->sb.version_incompat_allowed &&
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
- CLASS(printbuf, buf)();
-
- prt_str(&buf, "Now allowing incompatible features up to ");
- bch2_version_to_text(&buf, new_version);
- prt_str(&buf, ", previously allowed up to ");
- bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
- prt_newline(&buf);
-
- bch_notice(c, "%s", buf.buf);
- ret = true;
- }
-
- if (ret)
- bch2_sb_upgrade(c, new_version,
- c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
-
- return ret;
-}
-
int bch2_fs_recovery(struct bch_fs *c)
{
struct bch_sb_field_clean *clean = NULL;
@@ -732,108 +648,6 @@ int bch2_fs_recovery(struct bch_fs *c)
bch_info(c, "recovering from unclean shutdown");
}
- if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
- bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
- ret = -EINVAL;
- goto err;
- }
-
- if (!c->sb.clean &&
- !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
- bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
- ret = -EINVAL;
- goto err;
- }
-
- if (c->opts.norecovery) {
- c->opts.recovery_pass_last = c->opts.recovery_pass_last
- ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
- : BCH_RECOVERY_PASS_snapshots_read;
- c->opts.nochanges = true;
- }
-
- if (c->opts.nochanges)
- c->opts.read_only = true;
-
- if (c->opts.journal_rewind) {
- bch_info(c, "rewinding journal, fsck required");
- c->opts.fsck = true;
- }
-
- if (go_rw_in_recovery(c)) {
- /*
- * start workqueues/kworkers early - kthread creation checks for
- * pending signals, which is _very_ annoying
- */
- ret = bch2_fs_init_rw(c);
- if (ret)
- goto err;
- }
-
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- bool write_sb = false;
-
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
- ext->recovery_passes_required[0] |=
- cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
- write_sb = true;
- }
-
- u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
- if (sb_passes) {
- CLASS(printbuf, buf)();
- prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
- prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
- bch_info(c, "%s", buf.buf);
- }
-
- if (bch2_check_version_downgrade(c)) {
- CLASS(printbuf, buf)();
-
- prt_str(&buf, "Version downgrade required:");
-
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&buf, "\n running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
- }
-
- bch_info(c, "%s", buf.buf);
- write_sb = true;
- }
-
- if (check_version_upgrade(c))
- write_sb = true;
-
- c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
-
- if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
- SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
- write_sb = true;
- }
-
- if (write_sb)
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- if (c->sb.clean)
- set_bit(BCH_FS_clean_recovery, &c->flags);
- if (c->opts.fsck)
- set_bit(BCH_FS_in_fsck, &c->flags);
- set_bit(BCH_FS_in_recovery, &c->flags);
-
- ret = bch2_blacklist_table_initialize(c);
- if (ret) {
- bch_err(c, "error initializing blacklist table");
- goto err;
- }
-
bch2_journal_pos_from_member_info_resume(c);
if (!c->sb.clean || c->opts.retain_recovery_info) {
@@ -1053,8 +867,8 @@ use_clean:
}
mutex_lock(&c->sb_lock);
- ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- write_sb = false;
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ bool write_sb = false;
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
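The journal_seq to journal_seq_offset conversion threaded through replay above boils down to two rules; the field widths here are assumptions, but the behavior matches the hunks: non-allocated keys reconstruct their absolute sequence from a per-filesystem base, and repair-allocated keys sort after everything else instead of abusing seq 0:

#include <stdbool.h>
#include <stdint.h>

static inline uint64_t journal_key_seq(uint64_t base_seq, uint64_t seq_offset)
{
	return base_seq + seq_offset;
}

/* allocated (repair-generated) keys sort last; journal keys sort by offset */
static int journal_key_seq_cmp(bool l_alloc, uint64_t l_off,
			       bool r_alloc, uint64_t r_off)
{
	if (!l_alloc && !r_alloc)
		return (l_off > r_off) - (l_off < r_off);

	return (l_alloc > r_alloc) - (l_alloc < r_alloc);
}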
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
index b63c20558d3d..2696eee00345 100644
--- a/fs/bcachefs/recovery_passes_format.h
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -37,7 +37,7 @@
x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE) \
x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 0784283ce78c..3ffd68d2608d 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -784,7 +784,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
/* Query replicas: */
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
- unsigned flags, bool print)
+ unsigned flags, struct printbuf *err)
{
struct bch_replicas_entry_v1 *e;
@@ -823,16 +823,14 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
: BCH_FORCE_IF_DATA_DEGRADED;
if (dflags & ~flags) {
- if (print) {
- CLASS(printbuf, buf)();
-
- bch2_replicas_entry_to_text(&buf, e);
- bch_err(c, "insufficient devices online (%u) for replicas entry %s",
- nr_online, buf.buf);
+ if (err) {
+ prt_printf(err, "insufficient devices online (%u) for replicas entry ",
+ nr_online);
+ bch2_replicas_entry_to_text(err, e);
+ prt_newline(err);
}
return false;
}
-
}
return true;
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 5aba2c1ce133..15023a9b0b1e 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -44,7 +44,7 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
}
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
- unsigned, bool);
+ unsigned, struct printbuf *);
unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
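The bool-print to printbuf conversion above sets the convention used by the rest of this series: a NULL printbuf means stay silent, otherwise diagnostics are appended and the caller decides where they go. A self-contained sketch of the pattern (toy printbuf, not the kernel's):

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

struct printbuf {
	char	buf[256];
	size_t	pos;
};

static void prt_printf(struct printbuf *out, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	out->pos += vsnprintf(out->buf + out->pos,
			      sizeof(out->buf) - out->pos, fmt, args);
	va_end(args);
}

static bool have_enough_widgets(int nr, int required, struct printbuf *err)
{
	if (nr >= required)
		return true;

	if (err)	/* NULL err == caller wants no diagnostics */
		prt_printf(err, "insufficient widgets: have %d, need %d\n",
			   nr, required);
	return false;
}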
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 2e3a56bfd085..17cd617664d9 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -22,7 +22,7 @@ enum counters_flags {
x(io_read_split, 33, TYPE_COUNTER) \
x(io_read_reuse_race, 34, TYPE_COUNTER) \
x(io_read_retry, 32, TYPE_COUNTER) \
- x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
+ x(io_read_fail_and_poison, 95, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(io_move_read, 35, TYPE_SECTORS) \
@@ -31,8 +31,11 @@ enum counters_flags {
x(io_move_fail, 38, TYPE_COUNTER) \
x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \
+ x(io_move_drop_only, 91, TYPE_COUNTER) \
+ x(io_move_noop, 92, TYPE_COUNTER) \
x(io_move_created_rebalance, 83, TYPE_COUNTER) \
x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \
+ x(rebalance_extent, 96, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
x(bucket_discard_fast, 79, TYPE_COUNTER) \
@@ -99,7 +102,9 @@ enum counters_flags {
x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \
x(trans_restart_split_race, 76, TYPE_COUNTER) \
x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \
- x(write_buffer_flush_sync, 78, TYPE_COUNTER)
+ x(write_buffer_flush_sync, 78, TYPE_COUNTER) \
+ x(accounting_key_to_wb_slowpath, 94, TYPE_COUNTER) \
+ x(error_throw, 93, TYPE_COUNTER)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
@@ -120,4 +125,13 @@ struct bch_sb_field_counters {
__le64 d[];
};
+static inline void __maybe_unused check_bch_counter_ids_unique(void)
+{
+ switch (0) {
+#define x(t, n, ...) case (n):
+ BCH_PERSISTENT_COUNTERS();
+#undef x
+ ;
+ }
+}
+
#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
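check_bch_counter_ids_unique() above uses a standard C trick: expanding every counter ID as a case label of a single switch turns any duplicate ID into a compile-time "duplicate case value" error, at zero runtime cost. A minimal standalone version of the same trick:

#define MY_IDS() x(foo, 0) x(bar, 1) x(baz, 2)

static inline void check_my_ids_unique(void)
{
	switch (0) {
#define x(name, nr) case (nr):
	MY_IDS()
#undef x
		;
	}
}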
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 5317b1bfe2e5..aa0ea1ec9f10 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -328,6 +328,7 @@ enum bch_fsck_flags {
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \
+ x(accounting_key_underflow, 325, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
@@ -336,7 +337,7 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 325, 0)
+ x(MAX, 326, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index e3c73d903898..d26a0ca4a59d 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -36,10 +36,12 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
{
- if (dev != BCH_SB_MEMBER_INVALID)
+ if (dev != BCH_SB_MEMBER_INVALID) {
bch2_fs_inconsistent(c, "pointer to %s device %u",
test_bit(dev, c->devs_removed.d)
? "removed" : "nonexistent", dev);
+ dump_stack();
+ }
}
void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 84f987d3a02a..eab0c1e3ff56 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1673,7 +1673,8 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return ret;
darray_for_each(*deleted, i)
- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
+ nr_deleted_ancestors += bch2_snapshots_same_tree(c, s->k.p.offset, i->id) &&
+ bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
if (!nr_deleted_ancestors)
return 0;
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index fef32a0118c4..28d9a29a1fd0 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -51,6 +51,17 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
return s ? s->tree : 0;
}
+static inline bool bch2_snapshots_same_tree(struct bch_fs *c, u32 id1, u32 id2)
+{
+ if (id1 == id2)
+ return true;
+
+ guard(rcu)();
+ const struct snapshot_t *s1 = snapshot_t(c, id1);
+ const struct snapshot_t *s2 = snapshot_t(c, id2);
+ return s1 && s2 && s1->tree == s2->tree;
+}
+
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
const struct snapshot_t *s = snapshot_t(c, id);
@@ -157,6 +168,10 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
+ EBUG_ON(!id);
+ EBUG_ON(!ancestor);
+ EBUG_ON(!bch2_snapshots_same_tree(c, id, ancestor));
+
return id == ancestor
? true
: __bch2_snapshot_is_ancestor(c, id, ancestor);
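With the new EBUG_ON asserting same-tree IDs, callers that cannot guarantee both snapshot IDs share a tree should filter first, as the snapshot.c hunk above now does. An illustrative wrapper composing the two helpers from this diff:

static bool snapshot_is_ancestor_same_tree(struct bch_fs *c, u32 id, u32 ancestor)
{
	/* bch2_snapshot_is_ancestor() now asserts same-tree, so check first */
	return bch2_snapshots_same_tree(c, id, ancestor) &&
	       bch2_snapshot_is_ancestor(c, id, ancestor);
}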
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 8c0fb44929cc..2a61cc36ddbf 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -34,6 +34,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
struct bch_hash_info {
u32 inum_snapshot;
u8 type;
+ bool is_31bit;
struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
@@ -48,6 +49,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
struct bch_hash_info info = {
.inum_snapshot = bi->bi_snapshot,
.type = INODE_STR_HASH(bi),
+ .is_31bit = bi->bi_flags & BCH_INODE_31bit_dirent_offset,
.cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
.siphash_key = { .k0 = bi->bi_hash_seed }
};
@@ -112,8 +114,8 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
}
}
-static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
- const struct bch_hash_info *info)
+static inline u64 __bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info)
{
switch (info->type) {
case BCH_STR_HASH_crc32c:
@@ -128,6 +130,14 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
}
}
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info,
+ bool maybe_31bit)
+{
+ return __bch2_str_hash_end(ctx, info) &
+ (maybe_31bit && info->is_31bit ? INT_MAX : U64_MAX);
+}
+
struct bch_hash_desc {
enum btree_id btree_id;
u8 key_type;
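The masking in bch2_str_hash_end() reduces to: when the inode carries BCH_INODE_31bit_dirent_offset and the call site permits it, clamp the hash to 31 bits (presumably so dirent offsets fit a signed 32-bit directory cookie; that rationale is inferred from the flag name). Standalone:

#include <stdbool.h>
#include <stdint.h>

static uint64_t str_hash_end_31bit(uint64_t hash, bool is_31bit, bool maybe_31bit)
{
	/* 0x7fffffff == INT_MAX: keep the low 31 bits */
	return hash & ((maybe_31bit && is_31bit) ? 0x7fffffffULL : UINT64_MAX);
}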
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index be7ed612d28f..61eeac671283 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -89,7 +89,7 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
prt_str(&buf, "requested incompat feature ");
bch2_version_to_text(&buf, version);
prt_str(&buf, " currently not enabled, allowed up to ");
- bch2_version_to_text(&buf, version);
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
prt_printf(&buf, "\n set version_upgrade=incompat to enable");
bch_notice(c, "%s", buf.buf);
@@ -379,7 +379,7 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+int bch2_sb_validate(struct bch_sb *sb, struct bch_opts *opts, u64 read_offset,
enum bch_validate_flags flags, struct printbuf *out)
{
enum bch_opt_id opt_id;
@@ -389,28 +389,30 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
if (ret)
return ret;
- u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
- unsigned incompat_bit = 0;
- if (incompat)
- incompat_bit = __ffs64(incompat);
- else if (sb->features[1])
- incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
-
- if (incompat_bit) {
- prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
- incompat_bit,
- bch2_sb_features[BCH_FEATURE_NR - 1],
- BCH_FEATURE_NR - 1);
- return -BCH_ERR_invalid_sb_features;
- }
+ if (!opts->no_version_check) {
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+ unsigned incompat_bit = 0;
+ if (incompat)
+ incompat_bit = __ffs64(incompat);
+ else if (sb->features[1])
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+ if (incompat_bit) {
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+ incompat_bit,
+ bch2_sb_features[BCH_FEATURE_NR - 1],
+ BCH_FEATURE_NR - 1);
+ return -BCH_ERR_invalid_sb_features;
+ }
- if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
- BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_str(out, "Filesystem has incompatible version ");
- bch2_version_to_text(out, le16_to_cpu(sb->version));
- prt_str(out, ", current version ");
- bch2_version_to_text(out, bcachefs_metadata_version_current);
- return -BCH_ERR_invalid_sb_features;
+ if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
+ BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
+ prt_str(out, "Filesystem has incompatible version ");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_str(out, ", current version ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ return -BCH_ERR_invalid_sb_features;
+ }
}
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
@@ -915,7 +917,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb->sb, offset, 0, &err);
+ ret = bch2_sb_validate(sb->sb, opts, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -1081,9 +1083,10 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_from_fs(c, (*ca));
darray_for_each(online_devices, ca) {
+ struct bch_opts opts = bch2_opts_empty();
printbuf_reset(&err);
- ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, &opts, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
@@ -1186,13 +1189,13 @@ int bch2_write_super(struct bch_fs *c)
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+ bch2_have_enough_devs(c, sb_written, degraded_flags, NULL);
for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
- bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+ bch2_have_enough_devs(c, sb_written, degraded_flags, NULL);
/*
* If we would be able to mount _without_ the devices we successfully
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index a3b7a90f2533..82cb3a3ceeae 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -92,7 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+int bch2_sb_validate(struct bch_sb *, struct bch_opts *, u64,
+ enum bch_validate_flags, struct printbuf *);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index b0019488f586..793c16fa8b09 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -55,6 +55,7 @@
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
+#include "sb-downgrade.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
@@ -236,6 +237,7 @@ static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
+static int bch2_dev_attach_bdev(struct bch_fs *, struct bch_sb_handle *, struct printbuf *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
@@ -842,6 +844,244 @@ int bch2_fs_init_rw(struct bch_fs *c)
return 0;
}
+static bool check_version_upgrade(struct bch_fs *c)
+{
+ unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned latest_compatible = min(latest_version,
+ bch2_latest_compatible_version(c->sb.version));
+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+ unsigned new_version = 0;
+ bool ret = false;
+
+ if (old_version < bcachefs_metadata_required_upgrade_below) {
+ if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
+ latest_compatible < bcachefs_metadata_required_upgrade_below)
+ new_version = latest_version;
+ else
+ new_version = latest_compatible;
+ } else {
+ switch (c->opts.version_upgrade) {
+ case BCH_VERSION_UPGRADE_compatible:
+ new_version = latest_compatible;
+ break;
+ case BCH_VERSION_UPGRADE_incompatible:
+ new_version = latest_version;
+ break;
+ case BCH_VERSION_UPGRADE_none:
+ new_version = min(old_version, latest_version);
+ break;
+ }
+ }
+
+ if (new_version > old_version) {
+ CLASS(printbuf, buf)();
+
+ if (old_version < bcachefs_metadata_required_upgrade_below)
+ prt_str(&buf, "Version upgrade required:\n");
+
+ if (old_version != c->sb.version) {
+ prt_str(&buf, "Version upgrade from ");
+ bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, c->sb.version);
+ prt_str(&buf, " incomplete\n");
+ }
+
+ prt_printf(&buf, "Doing %s version upgrade from ",
+ BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
+ ? "incompatible" : "compatible");
+ bch2_version_to_text(&buf, old_version);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_newline(&buf);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_upgrade(c, old_version, new_version);
+ passes = ext->recovery_passes_required[0] & ~passes;
+
+ if (passes) {
+ prt_str(&buf, " running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ }
+
+ bch_notice(c, "%s", buf.buf);
+ ret = true;
+ }
+
+ if (new_version > c->sb.version_incompat_allowed &&
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
+ CLASS(printbuf, buf)();
+
+ prt_str(&buf, "Now allowing incompatible features up to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_str(&buf, ", previously allowed up to ");
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+ prt_newline(&buf);
+
+ bch_notice(c, "%s", buf.buf);
+ ret = true;
+ }
+
+ if (ret)
+ bch2_sb_upgrade(c, new_version,
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
+
+ return ret;
+}
+
+noinline_for_stack
+static int bch2_fs_opt_version_init(struct bch_fs *c)
+{
+ int ret = 0;
+
+ if (c->opts.norecovery) {
+ c->opts.recovery_pass_last = c->opts.recovery_pass_last
+ ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
+ : BCH_RECOVERY_PASS_snapshots_read;
+ c->opts.nochanges = true;
+ }
+
+ if (c->opts.nochanges)
+ c->opts.read_only = true;
+
+ if (c->opts.journal_rewind)
+ c->opts.fsck = true;
+
+ CLASS(printbuf, p)();
+ bch2_log_msg_start(c, &p);
+
+ prt_str(&p, "starting version ");
+ bch2_version_to_text(&p, c->sb.version);
+
+ bool first = true;
+ for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+ if (!(opt->flags & OPT_MOUNT))
+ continue;
+
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ prt_str(&p, first ? " opts=" : ",");
+ first = false;
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
+ }
+
+ if (c->sb.version_incompat_allowed != c->sb.version) {
+ prt_printf(&p, "\nallowing incompatible features above ");
+ bch2_version_to_text(&p, c->sb.version_incompat_allowed);
+ }
+
+ if (c->opts.verbose) {
+ prt_printf(&p, "\nfeatures: ");
+ prt_bitflags(&p, bch2_sb_features, c->sb.features);
+ }
+
+ if (c->sb.multi_device) {
+ prt_printf(&p, "\nwith devices");
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
+ prt_char(&p, ' ');
+ prt_str(&p, ca->name);
+ }
+ }
+
+ /* cf_encoding log message should be here, but it breaks xfstests - sigh */
+
+ if (c->opts.journal_rewind)
+ prt_printf(&p, "\nrewinding journal, fsck required");
+
+ scoped_guard(mutex, &c->sb_lock) {
+ struct bch_sb_field_ext *ext = bch2_sb_field_get_minsize(&c->disk_sb, ext,
+ sizeof(struct bch_sb_field_ext) / sizeof(u64));
+ if (!ext)
+ return bch_err_throw(c, ENOSPC_sb);
+
+ ret = bch2_sb_members_v2_init(c);
+ if (ret)
+ return ret;
+
+ __le64 now = cpu_to_le64(ktime_get_real_seconds());
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = now;
+
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+
+ u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (sb_passes) {
+ prt_str(&p, "\nsuperblock requires following recovery passes to be run:\n ");
+ prt_bitflags(&p, bch2_recovery_passes, sb_passes);
+ }
+
+ u64 btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+ if (btrees_lost_data) {
+ prt_str(&p, "\nsuperblock indicates damage to following btrees:\n ");
+ prt_bitflags(&p, __bch2_btree_ids, btrees_lost_data);
+ }
+
+ if (bch2_check_version_downgrade(c)) {
+ prt_str(&p, "\nVersion downgrade required:");
+
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&p, "\nrunning recovery passes: ");
+ prt_bitflags(&p, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ }
+ }
+
+ check_version_upgrade(c);
+
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors)
+ SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
+
+ /* Don't write the superblock, defer that until we go rw */
+ }
+
+ if (c->sb.clean)
+ set_bit(BCH_FS_clean_recovery, &c->flags);
+ if (c->opts.fsck)
+ set_bit(BCH_FS_in_fsck, &c->flags);
+ set_bit(BCH_FS_in_recovery, &c->flags);
+
+ bch2_print_str(c, KERN_INFO, p.buf);
+
+ /* this really should be part of our one multi line mount message, but -
+ * xfstests... */
+ if (c->cf_encoding)
+ bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+
+ if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+ return -EINVAL;
+ }
+
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
bch_sb_handles *sbs)
{
@@ -1013,6 +1253,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
ret =
bch2_fs_async_obj_init(c) ?:
+ bch2_blacklist_table_initialize(c) ?:
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
@@ -1063,7 +1304,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
}
#endif
- for (i = 0; i < c->sb.nr_devices; i++) {
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
ret = bch2_dev_alloc(c, i);
@@ -1078,6 +1319,30 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
&c->clock_journal_res,
(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
+ scoped_guard(rwsem_write, &c->state_lock)
+ darray_for_each(*sbs, sb) {
+ CLASS(printbuf, err)();
+ ret = bch2_dev_attach_bdev(c, sb, &err);
+ if (ret) {
+ bch_err(bch2_dev_locked(c, sb->sb->dev_idx), "%s", err.buf);
+ goto err;
+ }
+ }
+
+ ret = bch2_fs_opt_version_init(c);
+ if (ret)
+ goto err;
+
+ /*
+ * start workqueues/kworkers early - kthread creation checks for pending
+ * signals, which is _very_ annoying
+ */
+ if (go_rw_in_recovery(c)) {
+ ret = bch2_fs_init_rw(c);
+ if (ret)
+ goto err;
+ }
+
scoped_guard(mutex, &bch_fs_list_lock)
ret = bch2_fs_online(c);
@@ -1093,53 +1358,6 @@ err:
goto out;
}
-noinline_for_stack
-static void print_mount_opts(struct bch_fs *c)
-{
- enum bch_opt_id i;
- CLASS(printbuf, p)();
- bch2_log_msg_start(c, &p);
-
- prt_str(&p, "starting version ");
- bch2_version_to_text(&p, c->sb.version);
-
- bool first = true;
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
-
- if (!(opt->flags & OPT_MOUNT))
- continue;
-
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- prt_str(&p, first ? " opts=" : ",");
- first = false;
- bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
- }
-
- if (c->sb.version_incompat_allowed != c->sb.version) {
- prt_printf(&p, "\nallowing incompatible features above ");
- bch2_version_to_text(&p, c->sb.version_incompat_allowed);
- }
-
- if (c->opts.verbose) {
- prt_printf(&p, "\nfeatures: ");
- prt_bitflags(&p, bch2_sb_features, c->sb.features);
- }
-
- if (c->sb.multi_device) {
- prt_printf(&p, "\nwith devices");
- for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
- prt_char(&p, ' ');
- prt_str(&p, ca->name);
- }
- }
-
- bch2_print_str(c, KERN_INFO, p.buf);
-}
-
static bool bch2_fs_may_start(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -1166,46 +1384,28 @@ static bool bch2_fs_may_start(struct bch_fs *c)
return false;
}
break;
- }
+ }
}
- return bch2_have_enough_devs(c, c->online_devs, flags, true);
+ CLASS(printbuf, err)();
+ bool ret = bch2_have_enough_devs(c, c->online_devs, flags, &err);
+ if (!ret)
+ bch2_print_str(c, KERN_ERR, err.buf);
+ return ret;
}
int bch2_fs_start(struct bch_fs *c)
{
- time64_t now = ktime_get_real_seconds();
int ret = 0;
BUG_ON(test_bit(BCH_FS_started, &c->flags));
- print_mount_opts(c);
-
- if (c->cf_encoding)
- bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
-
if (!bch2_fs_may_start(c))
return bch_err_throw(c, insufficient_devices_to_start);
scoped_guard(rwsem_write, &c->state_lock) {
- guard(mutex)(&c->sb_lock);
- if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
- sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
- ret = bch_err_throw(c, ENOSPC_sb);
- goto err;
- }
-
- ret = bch2_sb_members_v2_init(c);
- if (ret)
- goto err;
-
scoped_guard(rcu)
for_each_online_member_rcu(c, ca) {
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
- cpu_to_le64(now);
if (ca->mi.state == BCH_MEMBER_STATE_rw)
bch2_dev_allocator_add(c, ca);
}
@@ -1566,19 +1766,20 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
return 0;
}
-static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb,
+ struct printbuf *err)
{
unsigned ret;
if (bch2_dev_is_online(ca)) {
- bch_err(ca, "already have device online in slot %u",
- sb->sb->dev_idx);
+ prt_printf(err, "already have device online in slot %u\n",
+ sb->sb->dev_idx);
return bch_err_throw(ca->fs, device_already_online);
}
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
- bch_err(ca, "cannot online: device too small (capacity %llu filesystem size %llu nbuckets %llu)",
+ prt_printf(err, "cannot online: device too small (capacity %llu filesystem size %llu nbuckets %llu)\n",
get_capacity(sb->bdev->bd_disk),
ca->mi.bucket_size * ca->mi.nbuckets,
ca->mi.nbuckets);
@@ -1614,7 +1815,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
return 0;
}
-static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb,
+ struct printbuf *err)
{
struct bch_dev *ca;
int ret;
@@ -1629,7 +1831,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
ca = bch2_dev_locked(c, sb->sb->dev_idx);
- ret = __bch2_dev_attach_bdev(ca, sb);
+ ret = __bch2_dev_attach_bdev(ca, sb, err);
if (ret)
return ret;
@@ -1653,7 +1855,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
* because we got an error or what have you?
*/
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
+ enum bch_member_state new_state, int flags,
+ struct printbuf *err)
{
struct bch_devs_mask new_online_devs;
int nr_rw = 0, required;
@@ -1690,7 +1893,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
new_online_devs = c->online_devs;
__clear_bit(ca->dev_idx, new_online_devs.d);
- return bch2_have_enough_devs(c, new_online_devs, flags, false);
+ return bch2_have_enough_devs(c, new_online_devs, flags, err);
default:
BUG();
}
@@ -1724,14 +1927,15 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
+ enum bch_member_state new_state, int flags,
+ struct printbuf *err)
{
int ret = 0;
if (ca->mi.state == new_state)
return 0;
- if (!bch2_dev_state_allowed(c, ca, new_state, flags))
+ if (!bch2_dev_state_allowed(c, ca, new_state, flags, err))
return bch_err_throw(c, device_state_not_allowed);
if (new_state != BCH_MEMBER_STATE_rw)
@@ -1754,15 +1958,17 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
}
int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
- enum bch_member_state new_state, int flags)
+ enum bch_member_state new_state, int flags,
+ struct printbuf *err)
{
guard(rwsem_write)(&c->state_lock);
- return __bch2_dev_set_state(c, ca, new_state, flags);
+ return __bch2_dev_set_state(c, ca, new_state, flags, err);
}
/* Device add/removal: */
-int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags,
+ struct printbuf *err)
{
unsigned dev_idx = ca->dev_idx, data;
bool fast_device_removal = !bch2_request_incompat_feature(c,
@@ -1777,8 +1983,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*/
bch2_dev_put(ca);
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot remove without losing data");
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags, NULL)) {
+ prt_printf(err, "Cannot remove without losing data\n");
ret = bch_err_throw(c, device_state_not_allowed);
goto err;
}
@@ -1798,16 +2004,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!data_type_is_empty(i) &&
!data_type_is_hidden(i) &&
usage.buckets[i]) {
- bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
- __bch2_data_types[i], usage.buckets[i]);
+ prt_printf(err, "Remove failed: still has data (%s, %llu buckets)\n",
+ __bch2_data_types[i], usage.buckets[i]);
ret = -EBUSY;
goto err;
}
ret = bch2_dev_remove_alloc(c, ca);
- bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_remove_alloc() error: %s\n", bch2_err_str(ret));
goto err;
+ }
/*
* We need to flush the entire journal to get rid of keys that reference
@@ -1820,25 +2027,28 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* calls, and could be cleaned up:
*/
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_journal_flush_device_pins() error: %s\n", bch2_err_str(ret));
goto err;
+ }
ret = bch2_journal_flush(&c->journal);
- bch_err_msg(ca, ret, "bch2_journal_flush()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_journal_flush() error: %s\n", bch2_err_str(ret));
goto err;
+ }
ret = bch2_replicas_gc2(c);
- bch_err_msg(ca, ret, "bch2_replicas_gc2()");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_replicas_gc2() error: %s\n", bch2_err_str(ret));
goto err;
+ }
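+ /* Anything still on the device at this point means removal would lose data: */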
data = bch2_dev_has_data(c, ca);
if (data) {
- CLASS(printbuf, data_has)();
- prt_bitflags(&data_has, __bch2_data_types, data);
- bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ prt_str(err, "Remove failed, still has data (");
+ prt_bitflags(err, __bch2_data_types, data);
+ prt_str(err, ")\n");
ret = -EBUSY;
goto err;
}
@@ -1883,7 +2093,7 @@ err:
}
/* Add new device to running filesystem: */
-int bch2_dev_add(struct bch_fs *c, const char *path)
+int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = {};
@@ -1892,9 +2102,10 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
int ret = 0;
ret = bch2_read_super(path, &opts, &sb);
- bch_err_msg(c, ret, "reading super");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error reading superblock: %s\n", bch2_err_str(ret));
goto err;
+ }
struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
@@ -1915,7 +2126,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
}
if (ret) {
- bch_err(c, "filesystem UUID already open");
+ prt_printf(err, "cannot go multidevice: filesystem UUID already open\n");
goto err;
}
}
@@ -1930,7 +2141,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
}
- ret = __bch2_dev_attach_bdev(ca, &sb);
+ ret = __bch2_dev_attach_bdev(ca, &sb, err);
if (ret)
goto err;
@@ -1939,16 +2150,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
ret = bch2_sb_from_fs(c, ca);
- bch_err_msg(c, ret, "setting up new superblock");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error setting up new superblock: %s\n", bch2_err_str(ret));
goto err;
+ }
if (dynamic_fault("bcachefs:add:no_slot"))
goto err;
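+ /* On success, bch2_sb_member_alloc() returns the new member slot index: */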
ret = bch2_sb_member_alloc(c);
if (ret < 0) {
- bch_err_msg(c, ret, "setting up new superblock");
+ prt_printf(err, "error allocating superblock member slot: %s\n", bch2_err_str(ret));
goto err;
}
unsigned dev_idx = ret;
@@ -1966,7 +2178,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (BCH_MEMBER_GROUP(&dev_mi)) {
ret = __bch2_dev_group_set(c, ca, label.buf);
- bch_err_msg(c, ret, "creating new label");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error creating new label: %s\n", bch2_err_str(ret));
goto err_late;
+ }
}
@@ -1980,22 +2192,25 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (test_bit(BCH_FS_started, &c->flags)) {
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(ca, ret, "marking new superblock");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error marking new superblock: %s\n", bch2_err_str(ret));
goto err_late;
+ }
ret = bch2_fs_freespace_init(c);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error initializing free space: %s\n", bch2_err_str(ret));
goto err_late;
+ }
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
+ if (ret) {
+ prt_printf(err, "error allocating journal: %s\n", bch2_err_str(ret));
goto err_late;
+ }
}
/*
@@ -2028,7 +2243,7 @@ err_late:
}
/* Hot add existing device to running filesystem: */
-int bch2_dev_online(struct bch_fs *c, const char *path)
+int bch2_dev_online(struct bch_fs *c, const char *path, struct printbuf *err)
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
@@ -2039,42 +2254,48 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
guard(rwsem_write)(&c->state_lock);
ret = bch2_read_super(path, &opts, &sb);
- if (ret)
+ if (ret) {
+ prt_printf(err, "error reading superblock: %s\n", bch2_err_str(ret));
return ret;
+ }
dev_idx = sb.sb->dev_idx;
ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
- bch_err_msg(c, ret, "bringing %s online", path);
- if (ret)
+ if (ret) {
+ prt_printf(err, "device not a member of fs: %s\n", bch2_err_str(ret));
goto err;
+ }
- ret = bch2_dev_attach_bdev(c, &sb);
+ ret = bch2_dev_attach_bdev(c, &sb, err);
if (ret)
goto err;
ca = bch2_dev_locked(c, dev_idx);
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_trans_mark_dev_sb() error: %s\n", bch2_err_str(ret));
goto err;
+ }
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
if (!ca->mi.freespace_initialized) {
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_freespace_init() error: %s\n", bch2_err_str(ret));
goto err;
+ }
}
if (!ca->journal.nr) {
ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(ca, ret, "allocating journal");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_journal_alloc() error: %s\n", bch2_err_str(ret));
goto err;
+ }
}
scoped_guard(mutex, &c->sb_lock) {
@@ -2089,17 +2310,17 @@ err:
return ret;
}
-int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags, struct printbuf *err)
{
guard(rwsem_write)(&c->state_lock);
if (!bch2_dev_is_online(ca)) {
- bch_err(ca, "Already offline");
+ prt_printf(err, "Already offline\n");
return 0;
}
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
- bch_err(ca, "Cannot offline required disk");
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags, err)) {
+ prt_printf(err, "Cannot offline required disk\n");
return bch_err_throw(c, device_state_not_allowed);
}
@@ -2119,7 +2340,7 @@ static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new
bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
}
-int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct printbuf *err)
{
u64 old_nbuckets;
int ret = 0;
@@ -2128,31 +2349,36 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
old_nbuckets = ca->mi.nbuckets;
if (nbuckets < ca->mi.nbuckets) {
- bch_err(ca, "Cannot shrink yet");
+ prt_printf(err, "Cannot shrink yet\n");
return -EINVAL;
}
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
- bch_err(ca, "New device size too big (%llu greater than max %u)",
- nbuckets, BCH_MEMBER_NBUCKETS_MAX);
+ prt_printf(err, "New device size too big (%llu greater than max %u)\n",
+ nbuckets, BCH_MEMBER_NBUCKETS_MAX);
return bch_err_throw(c, device_size_too_big);
}
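+ /* get_capacity() and bucket_size are both in 512-byte sectors: */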
if (bch2_dev_is_online(ca) &&
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
- bch_err(ca, "New size larger than device");
+ prt_printf(err, "New size %llu larger than device size %llu\n",
+ ca->mi.bucket_size * nbuckets,
+ get_capacity(ca->disk_sb.bdev->bd_disk));
return bch_err_throw(c, device_size_too_small);
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- bch_err_msg(ca, ret, "resizing buckets");
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_dev_buckets_resize() error: %s\n", bch2_err_str(ret));
return ret;
+ }
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- if (ret)
+ if (ret) {
+ prt_printf(err, "bch2_trans_mark_dev_sb() error: %s\n", bch2_err_str(ret));
return ret;
+ }
scoped_guard(mutex, &c->sb_lock) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
@@ -2163,8 +2389,10 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (ca->mi.freespace_initialized) {
ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
- if (ret)
+ if (ret) {
+ prt_printf(err, "__bch2_dev_resize_alloc() error: %s\n", bch2_err_str(ret));
return ret;
+ }
}
bch2_recalc_capacity(c);
@@ -2275,10 +2503,14 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
if (ca) {
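+ /* Build the message buffer first so bch2_dev_state_allowed() can append its reason: */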
+ CLASS(printbuf, buf)();
+ __bch2_log_msg_start(ca->name, &buf);
+ prt_printf(&buf, "offline from block layer\n");
+
bool dev = bch2_dev_state_allowed(c, ca,
BCH_MEMBER_STATE_failed,
- BCH_FORCE_IF_DEGRADED);
-
+ BCH_FORCE_IF_DEGRADED,
+ &buf);
if (!dev && sb) {
if (!surprise)
sync_filesystem(sb);
@@ -2286,11 +2518,6 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
evict_inodes(sb);
}
- CLASS(printbuf, buf)();
- __bch2_log_msg_start(ca->name, &buf);
-
- prt_printf(&buf, "offline from block layer");
-
if (dev) {
__bch2_dev_offline(c, ca);
} else {
@@ -2368,11 +2595,6 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices,
BUG_ON(darray_push(&sbs, sb));
}
- if (opts->nochanges && !opts->read_only) {
- ret = bch_err_throw(c, erofs_nochanges);
- goto err_print;
- }
-
darray_for_each(sbs, sb)
if (!best || sb_cmp(sb->sb, best->sb) > 0)
best = sb;
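+ /* sb_cmp() orders by sequence number: use the most recently written superblock */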
@@ -2398,13 +2620,6 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices,
if (ret)
goto err;
- scoped_guard(rwsem_write, &c->state_lock)
- darray_for_each(sbs, sb) {
- ret = bch2_dev_attach_bdev(c, sb);
- if (ret)
- goto err;
- }
-
if (!c->opts.nostart) {
ret = bch2_fs_start(c);
if (ret)
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index e90bab9afe78..d13dbf2b8227 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -17,18 +17,20 @@ struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
+ enum bch_member_state, int,
+ struct printbuf *);
int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
+ enum bch_member_state, int,
+ struct printbuf *);
int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
- enum bch_member_state, int);
-
-int bch2_dev_fail(struct bch_dev *, int);
-int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_add(struct bch_fs *, const char *);
-int bch2_dev_online(struct bch_fs *, const char *);
-int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+ enum bch_member_state, int,
+ struct printbuf *);
+
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int, struct printbuf *);
+int bch2_dev_add(struct bch_fs *, const char *, struct printbuf *);
+int bch2_dev_online(struct bch_fs *, const char *, struct printbuf *);
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int, struct printbuf *);
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64, struct printbuf *);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 3776a1403104..269cdf1a87a4 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1179,6 +1179,11 @@ DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
TP_ARGS(trans, caller_ip)
);
+DEFINE_EVENT(fs_str, accounting_key_to_wb_slowpath,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
index 2ad338e282da..446770ec3bd6 100644
--- a/fs/bcachefs/vstructs.h
+++ b/fs/bcachefs/vstructs.h
@@ -23,6 +23,9 @@
(size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
})
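+/* Total size in u64s, including the header before _data: */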
+#define vstruct_u64s(_s) \
+ (offsetof(typeof(*(_s)), _data) / sizeof(u64) + __vstruct_u64s(_s))
+
#define vstruct_bytes(_s) \
__vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 6094b568dd33..6d7303008b19 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -4,6 +4,7 @@
#include "acl.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "dirent.h"
#include "extents.h"
#include "fs.h"
#include "rebalance.h"
@@ -25,7 +26,7 @@ static u64 bch2_xattr_hash(const struct bch_hash_info *info,
bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
- return bch2_str_hash_end(&ctx, info);
+ return bch2_str_hash_end(&ctx, info, false);
}
static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
@@ -484,6 +485,22 @@ static int inode_opt_set_fn(struct btree_trans *trans,
return ret;
}
+ if (s->id == Inode_opt_inodes_32bit &&
+ !bch2_request_incompat_feature(trans->c, bcachefs_metadata_version_31bit_dirent_offset)) {
+ /*
+ * Make sure the dir is empty, as otherwise we'd need to
+ * rehash everything and update the dirent keys.
+ */
+ int ret = bch2_empty_dir_trans(trans, inode_inum(inode));
+ if (ret < 0)
+ return ret;
+
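+ /* Record the chosen dirent offset width in the inode's flags: */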
+ if (s->defined)
+ bi->bi_flags |= BCH_INODE_31bit_dirent_offset;
+ else
+ bi->bi_flags &= ~BCH_INODE_31bit_dirent_offset;
+ }
+
if (s->defined)
bi->bi_fields_set |= 1U << s->id;
else