summaryrefslogtreecommitdiff
path: root/fs/bcachefs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/alloc_background.c49
-rw-r--r--fs/bcachefs/alloc_background.h1
-rw-r--r--fs/bcachefs/alloc_foreground.c1
-rw-r--r--fs/bcachefs/alloc_types.h10
-rw-r--r--fs/bcachefs/backpointers.c2
-rw-r--r--fs/bcachefs/bcachefs.h9
-rw-r--r--fs/bcachefs/bcachefs_format.h1
-rw-r--r--fs/bcachefs/btree_gc.c26
-rw-r--r--fs/bcachefs/btree_io.c11
-rw-r--r--fs/bcachefs/btree_trans_commit.c4
-rw-r--r--fs/bcachefs/btree_types.h12
-rw-r--r--fs/bcachefs/btree_update_interior.c61
-rw-r--r--fs/bcachefs/checksum.h2
-rw-r--r--fs/bcachefs/data_update.c18
-rw-r--r--fs/bcachefs/data_update.h8
-rw-r--r--fs/bcachefs/disk_accounting.c176
-rw-r--r--fs/bcachefs/disk_accounting.h16
-rw-r--r--fs/bcachefs/errcode.h1
-rw-r--r--fs/bcachefs/error.c4
-rw-r--r--fs/bcachefs/extent_update.c1
-rw-r--r--fs/bcachefs/extents.c6
-rw-r--r--fs/bcachefs/extents.h4
-rw-r--r--fs/bcachefs/fs-io-buffered.c68
-rw-r--r--fs/bcachefs/fs-io-direct.c8
-rw-r--r--fs/bcachefs/fs-io.c4
-rw-r--r--fs/bcachefs/fs.c17
-rw-r--r--fs/bcachefs/inode.c52
-rw-r--r--fs/bcachefs/inode.h9
-rw-r--r--fs/bcachefs/io_misc.c14
-rw-r--r--fs/bcachefs/io_misc.h2
-rw-r--r--fs/bcachefs/io_read.c70
-rw-r--r--fs/bcachefs/io_read.h4
-rw-r--r--fs/bcachefs/io_write.c50
-rw-r--r--fs/bcachefs/io_write.h4
-rw-r--r--fs/bcachefs/io_write_types.h2
-rw-r--r--fs/bcachefs/lru.c45
-rw-r--r--fs/bcachefs/lru.h5
-rw-r--r--fs/bcachefs/migrate.c5
-rw-r--r--fs/bcachefs/move.c196
-rw-r--r--fs/bcachefs/move.h34
-rw-r--r--fs/bcachefs/opts.c40
-rw-r--r--fs/bcachefs/opts.h14
-rw-r--r--fs/bcachefs/progress.c2
-rw-r--r--fs/bcachefs/rebalance.c223
-rw-r--r--fs/bcachefs/rebalance.h52
-rw-r--r--fs/bcachefs/recovery.c14
-rw-r--r--fs/bcachefs/reflink.c16
-rw-r--r--fs/bcachefs/sb-counters_format.h2
-rw-r--r--fs/bcachefs/sb-errors_format.h2
-rw-r--r--fs/bcachefs/super.c11
-rw-r--r--fs/bcachefs/sysfs.c4
-rw-r--r--fs/bcachefs/trace.h36
-rw-r--r--fs/bcachefs/xattr.c7
53 files changed, 816 insertions, 619 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index b6850b15494d..21cdc42eff46 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1771,13 +1771,6 @@ static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
darray_remove_item(&ca->discard_buckets_in_flight, i);
}
-struct discard_buckets_state {
- u64 seen;
- u64 open;
- u64 need_journal_commit;
- u64 discarded;
-};
-
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca,
struct btree_iter *need_discard_iter,
@@ -1790,6 +1783,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
bool discard_locked = false;
int ret = 0;
+ s->seen++;
+
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
s->open++;
return 0;
@@ -1800,6 +1795,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
if (seq_ready > c->journal.flushed_seq_ondisk) {
if (seq_ready > c->journal.flushing_seq)
s->need_journal_commit++;
+ else
+ s->commit_in_flight++;
return 0;
}
@@ -1815,6 +1812,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
return ret;
if (a->v.data_type != BCH_DATA_need_discard) {
+ s->bad_data_type++;
+
if (need_discard_or_freespace_err(trans, k, true, true, true)) {
ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
if (ret)
@@ -1826,8 +1825,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
}
if (!fastpath) {
- if (discard_in_flight_add(ca, iter.pos.offset, true))
+ if (discard_in_flight_add(ca, iter.pos.offset, true)) {
+ s->already_discarding++;
goto out;
+ }
discard_locked = true;
}
@@ -1861,6 +1862,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
commit:
ret = bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@@ -1873,14 +1875,11 @@ out:
fsck_err:
if (discard_locked)
discard_in_flight_remove(ca, iter.pos.offset);
- if (!ret)
- s->seen++;
return ret;
}
-static void bch2_do_discards_work(struct work_struct *work)
+static void __bch2_dev_do_discards(struct bch_dev *ca)
{
- struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
struct bch_fs *c = ca->fs;
struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
@@ -1901,10 +1900,25 @@ static void bch2_do_discards_work(struct work_struct *work)
if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
bch2_journal_flush_async(&c->journal, NULL);
- trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
- bch2_err_str(ret));
+ trace_discard_buckets(c, &s, bch2_err_str(ret));
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
+}
+
+void bch2_do_discards_going_ro(struct bch_fs *c)
+{
+ for_each_member_device(c, ca)
+ if (bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
+ __bch2_dev_do_discards(ca);
+}
+
+static void bch2_do_discards_work(struct work_struct *work)
+{
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
+ struct bch_fs *c = ca->fs;
+
+ __bch2_dev_do_discards(ca);
+
enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
}
@@ -1992,7 +2006,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
break;
}
- trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
+ trace_discard_buckets_fast(c, &s, bch2_err_str(ret));
bch2_trans_put(trans);
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
@@ -2384,8 +2398,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
* We clear the LRU and need_discard btrees first so that we don't race
* with bch2_do_invalidates() and bch2_do_discards()
*/
- ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
+ ret = bch2_dev_remove_lrus(c, ca) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
@@ -2396,7 +2409,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
BTREE_TRIGGER_norun, NULL) ?:
- bch2_dev_usage_remove(c, ca->dev_idx);
+ bch2_dev_usage_remove(c, ca);
bch_err_msg(ca, ret, "removing dev alloc info");
return ret;
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index c2e8482fbbe6..a602507fef19 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -320,6 +320,7 @@ static inline int bch2_check_discard_freespace_key_async(struct btree_trans *tra
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_dev_do_discards(struct bch_dev *);
+void bch2_do_discards_going_ro(struct bch_fs *);
void bch2_do_discards(struct bch_fs *);
static inline u64 should_invalidate_buckets(struct bch_dev *ca,
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 3d125ee81663..97b627ed3b22 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1529,6 +1529,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
printbuf_tabstop_push(out, 24);
prt_printf(out, "capacity\t%llu\n", c->capacity);
+ prt_printf(out, "used\t%llu\n", bch2_fs_usage_read_short(c).used);
prt_printf(out, "reserved\t%llu\n", c->reserved);
prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index e7becdf22cba..ee52b66dc5d7 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -118,4 +118,14 @@ struct write_point_specifier {
unsigned long v;
};
+struct discard_buckets_state {
+ u64 seen;
+ u64 open;
+ u64 need_journal_commit;
+ u64 commit_in_flight;
+ u64 bad_data_type;
+ u64 already_discarding;
+ u64 discarded;
+};
+
#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 6aeb1c876619..c662eeba66ab 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -820,7 +820,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
- int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+ int level, depth = btree_type_has_data_ptrs(btree_id) ? 0 : 1;
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 0ede47f62129..83d6ab9c1a91 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -458,7 +458,6 @@ BCH_DEBUG_PARAMS_ALL()
x(btree_node_compact) \
x(btree_node_merge) \
x(btree_node_sort) \
- x(btree_node_get) \
x(btree_node_read) \
x(btree_node_read_done) \
x(btree_node_write) \
@@ -466,10 +465,6 @@ BCH_DEBUG_PARAMS_ALL()
x(btree_interior_update_total) \
x(btree_gc) \
x(data_write) \
- x(data_write_to_submit) \
- x(data_write_to_queue) \
- x(data_write_to_btree_update) \
- x(data_write_btree_update) \
x(data_read) \
x(data_promote) \
x(journal_flush_write) \
@@ -483,6 +478,7 @@ BCH_DEBUG_PARAMS_ALL()
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(blocked_write_buffer_full) \
+ x(blocked_writeback_throttle) \
x(nocow_lock_contended)
enum bch_time_stats {
@@ -675,6 +671,7 @@ struct bch_dev {
x(error) \
x(topology_error) \
x(errors_fixed) \
+ x(errors_fixed_silent) \
x(errors_not_fixed) \
x(no_invalid_checks) \
x(discard_mount_opt_set) \
@@ -808,6 +805,8 @@ struct bch_fs {
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
+ atomic_t opt_change_cookie;
+
unsigned loglevel;
unsigned prev_loglevel;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index b2de993d802b..0839397105a9 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -654,7 +654,6 @@ struct bch_sb_field_ext {
/*
* field 1: version name
* field 2: BCH_VERSION(major, minor)
- * field 3: recovery passess required on upgrade
*/
#define BCH_METADATA_VERSIONS() \
x(bkey_renumber, BCH_VERSION(0, 10)) \
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 43f294284d57..2338feb8d8ed 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -717,16 +717,12 @@ fsck_err:
static int bch2_gc_btree(struct btree_trans *trans,
struct progress_indicator_state *progress,
- enum btree_id btree, bool initial)
+ enum btree_id btree, unsigned target_depth,
+ bool initial)
{
struct bch_fs *c = trans->c;
- unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
int ret = 0;
- /* We need to make sure every leaf node is readable before going RW */
- if (initial)
- target_depth = 0;
-
for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
struct btree *prev = NULL;
struct btree_iter iter;
@@ -797,7 +793,21 @@ static int bch2_gc_btrees(struct bch_fs *c)
if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
continue;
- ret = bch2_gc_btree(trans, &progress, btree, true);
+
+ unsigned target_depth = BIT_ULL(btree) & btree_leaf_has_triggers_mask ? 0 : 1;
+
+ /*
+ * In fsck, we need to make sure every leaf node is readable
+ * before going RW, otherwise we can no longer rewind inside
+ * btree_lost_data to repair during the current fsck run.
+ *
+ * Otherwise, we can delay the repair to the next
+ * mount or offline fsck.
+ */
+ if (test_bit(BCH_FS_in_fsck, &c->flags))
+ target_depth = 0;
+
+ ret = bch2_gc_btree(trans, &progress, btree, target_depth, true);
}
bch_err_fn(c, ret);
@@ -1228,7 +1238,7 @@ int bch2_gc_gens(struct bch_fs *c)
}
for (unsigned i = 0; i < BTREE_ID_NR; i++)
- if (btree_type_has_ptrs(i)) {
+ if (btree_type_has_data_ptrs(i)) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 34ec1a90980d..52d21259ed6f 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -27,10 +27,15 @@
#include <linux/moduleparam.h>
#include <linux/sched/mm.h>
+static __maybe_unused unsigned bch2_btree_read_corrupt_ratio;
+static __maybe_unused int bch2_btree_read_corrupt_device;
+
#ifdef CONFIG_BCACHEFS_DEBUG
-static unsigned bch2_btree_read_corrupt_ratio;
module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
+
+module_param_named(btree_read_corrupt_device, bch2_btree_read_corrupt_device, int, 0644);
+MODULE_PARM_DESC(btree_read_corrupt_device, "");
#endif
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
@@ -1438,7 +1443,9 @@ start:
memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
bio->bi_iter.bi_size = btree_buf_bytes(b);
- bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
+ if (bch2_btree_read_corrupt_device == rb->pick.ptr.dev ||
+ bch2_btree_read_corrupt_device < 0)
+ bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
if (ret != -BCH_ERR_btree_node_read_err_want_retry &&
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 5fa7f2f9f1e9..2966971ee43e 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -970,6 +970,7 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans,
struct bkey_i *accounting;
retry:
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
percpu_down_read(&c->mark_lock);
for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
accounting != btree_trans_subbuf_top(trans, &trans->accounting);
@@ -983,6 +984,9 @@ retry:
}
percpu_up_read(&c->mark_lock);
+ /* Only fatal errors are possible later, so no need to revert this */
+ bch2_trans_account_disk_usage_change(trans);
+
trans_for_each_update(trans, i) {
ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
if (ret)
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index e893eb938bb3..9e3c851200eb 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -840,6 +840,10 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type)
return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
}
+/* A mask of btree id bits that have triggers for their leaves */
+__maybe_unused
+static const u64 btree_leaf_has_triggers_mask = BTREE_NODE_TYPE_HAS_TRIGGERS >> 1;
+
static const u64 btree_is_extents_mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
BCH_BTREE_IDS()
@@ -883,15 +887,15 @@ static inline bool btree_type_has_snapshot_field(enum btree_id btree)
return BIT_ULL(btree) & mask;
}
-static inline bool btree_type_has_ptrs(enum btree_id btree)
-{
- const u64 mask = 0
+static const u64 btree_has_data_ptrs_mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr)
BCH_BTREE_IDS()
#undef x
;
- return BIT_ULL(btree) & mask;
+static inline bool btree_type_has_data_ptrs(enum btree_id btree)
+{
+ return BIT_ULL(btree) & btree_has_data_ptrs_mask;
}
static inline bool btree_type_uses_write_buffer(enum btree_id btree)
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 8d9fcaa26268..ce86d158aa8e 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -324,9 +324,6 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct btree *b;
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
- ? BTREE_NODE_RESERVE
- : 0;
int ret;
b = bch2_btree_node_mem_alloc(trans, interior_node);
@@ -334,41 +331,6 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
return b;
BUG_ON(b->ob.nr);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
- guard(spinlock)(&c->freelist_lock);
- if (c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)) {
- if (cl)
- closure_wait(&c->open_buckets_wait, cl);
-
- ret = cl
- ? bch_err_throw(c, bucket_alloc_blocked)
- : bch_err_throw(c, open_buckets_empty);
- mutex_unlock(&c->btree_reserve_cache_lock);
- goto err;
- }
- }
-
- if (c->btree_reserve_cache_nr > nr_reserve) {
- for (struct btree_alloc *a = c->btree_reserve_cache;
- a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) {
- /* check if it has sufficient durability */
-
- if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) {
- bch2_open_buckets_put(c, &a->ob);
- *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr];
- continue;
- }
-
- bkey_copy(&b->key, &a->k);
- b->ob = a->ob;
- *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr];
- mutex_unlock(&c->btree_reserve_cache_lock);
- goto out;
- }
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
retry:
ret = bch2_alloc_sectors_start_trans(trans,
target ?:
@@ -398,12 +360,29 @@ retry:
goto retry;
}
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a = c->btree_reserve_cache + --c->btree_reserve_cache_nr;
+
+ /* check if it has sufficient durability */
+
+ if (can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) {
+ bkey_copy(&b->key, &a->k);
+ b->ob = a->ob;
+ mutex_unlock(&c->btree_reserve_cache_lock);
+ goto out;
+ }
+
+ bch2_open_buckets_put(c, &a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
bkey_btree_ptr_v2_init(&b->key);
bch2_alloc_sectors_append_ptrs(c, wp, &b->key, btree_sectors(c), false);
bch2_open_bucket_get(c, wp, &b->ob);
- bch2_alloc_sectors_done(c, wp);
out:
+ bch2_alloc_sectors_done(c, wp);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -723,8 +702,10 @@ static void btree_update_nodes_written(struct btree_update *as)
if (ret)
goto err;
- if (!btree_update_new_nodes_marked_sb(as))
+ if (!btree_update_new_nodes_marked_sb(as)) {
+ bch2_trans_unlock_long(trans);
btree_update_new_nodes_mark_sb(as);
+ }
/*
* Wait for any in flight writes to finish before we free the old nodes
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 7bd9cf6104ca..10bfadcde80a 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -130,7 +130,7 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
}
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
- struct bch_io_opts opts)
+ struct bch_inode_opts opts)
{
if (opts.nocow)
return 0;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 20b900bee32d..7a0da6cdf78c 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -11,6 +11,7 @@
#include "ec.h"
#include "error.h"
#include "extents.h"
+#include "inode.h"
#include "io_write.h"
#include "keylist.h"
#include "move.h"
@@ -428,13 +429,18 @@ restart_drop_extra_replicas:
goto out;
}
+ struct bch_inode_opts opts;
+
ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
+ bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?:
+ bch2_bkey_set_needs_rebalance(c, &opts, insert,
+ SET_NEEDS_REBALANCE_foreground,
+ m->op.opts.change_cookie) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_internal_snapshot_node);
if (ret)
@@ -613,7 +619,7 @@ int bch2_update_unwritten_extent(struct btree_trans *trans,
}
void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
if (!out->nr_tabstops)
@@ -682,7 +688,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
@@ -732,7 +738,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
}
static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
unsigned buf_bytes)
{
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
@@ -759,7 +765,7 @@ static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
}
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts)
+ struct bch_inode_opts *io_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
const union bch_extent_entry *entry;
@@ -831,7 +837,7 @@ int bch2_data_update_init(struct btree_trans *trans,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts data_opts,
enum btree_id btree_id,
struct bkey_s_c k)
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index fc12aa65366f..3b0ba6f6497f 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -23,7 +23,7 @@ struct data_update_opts {
};
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
- struct bch_io_opts *, struct data_update_opts *);
+ struct bch_inode_opts *, struct data_update_opts *);
#define BCH_DATA_UPDATE_TYPES() \
x(copygc, 0) \
@@ -76,18 +76,18 @@ void bch2_data_update_read_done(struct data_update *);
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
struct bkey_s_c,
- struct bch_io_opts *,
+ struct bch_inode_opts *,
struct data_update_opts *);
int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
- struct bch_io_opts *);
+ struct bch_inode_opts *);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
struct moving_context *,
struct data_update *,
struct write_point_specifier,
- struct bch_io_opts *, struct data_update_opts,
+ struct bch_inode_opts *, struct data_update_opts,
enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index f0ebf91cd5fd..a99f821c6a1c 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -239,10 +239,12 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
c, accounting_key_junk_at_end,
"junk at end of accounting key");
- bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
+ const unsigned nr_counters = bch2_accounting_counters(k.k);
+
+ bkey_fsck_err_on(!nr_counters || nr_counters > BCH_ACCOUNTING_MAX_COUNTERS,
c, accounting_key_nr_counters_wrong,
"accounting key with %u counters, should be %u",
- bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
+ nr_counters, bch2_accounting_type_nr_counters[acc_k.type]);
fsck_err:
return ret;
}
@@ -359,10 +361,13 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
accounting_pos_cmp, &a.k->p) < acc->k.nr)
return 0;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+
struct accounting_mem_entry n = {
.pos = a.k->p,
.bversion = a.k->bversion,
- .nr_counters = bch2_accounting_counters(a.k),
+ .nr_counters = bch2_accounting_type_nr_counters[acc_k.type],
.v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
sizeof(u64), GFP_KERNEL),
};
@@ -878,46 +883,44 @@ int bch2_accounting_read(struct bch_fs *c)
*dst++ = *i;
keys->gap = keys->nr = dst - keys->data;
- guard(percpu_write)(&c->mark_lock);
-
- darray_for_each_reverse(acc->k, i) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, i->pos);
+ CLASS(printbuf, underflow_err)();
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- memset(v, 0, sizeof(v));
+ scoped_guard(percpu_write, &c->mark_lock) {
+ darray_for_each_reverse(acc->k, i) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, i->pos);
- for (unsigned j = 0; j < i->nr_counters; j++)
- v[j] = percpu_u64_get(i->v[0] + j);
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ memset(v, 0, sizeof(v));
- /*
- * If the entry counters are zeroed, it should be treated as
- * nonexistent - it might point to an invalid device.
- *
- * Remove it, so that if it's re-added it gets re-marked in the
- * superblock:
- */
- ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
- ? -BCH_ERR_remove_disk_accounting_entry
- : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
-
- if (ret == -BCH_ERR_remove_disk_accounting_entry) {
- free_percpu(i->v[0]);
- free_percpu(i->v[1]);
- darray_remove_item(&acc->k, i);
- ret = 0;
- continue;
- }
+ for (unsigned j = 0; j < i->nr_counters; j++)
+ v[j] = percpu_u64_get(i->v[0] + j);
- if (ret)
- return ret;
- }
+ /*
+ * If the entry counters are zeroed, it should be treated as
+ * nonexistent - it might point to an invalid device.
+ *
+ * Remove it, so that if it's re-added it gets re-marked in the
+ * superblock:
+ */
+ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+ ? -BCH_ERR_remove_disk_accounting_entry
+ : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
+
+ if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+ free_percpu(i->v[0]);
+ free_percpu(i->v[1]);
+ darray_remove_item(&acc->k, i);
+ ret = 0;
+ continue;
+ }
- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, NULL);
+ if (ret)
+ return ret;
+ }
- scoped_guard(preempt) {
- struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
for (unsigned i = 0; i < acc->k.nr; i++) {
struct disk_accounting_pos k;
@@ -939,27 +942,20 @@ int bch2_accounting_read(struct bch_fs *c)
underflow |= (s64) v[j] < 0;
if (underflow) {
- CLASS(printbuf, buf)();
- bch2_log_msg_start(c, &buf);
-
- prt_printf(&buf, "Accounting underflow for\n");
- bch2_accounting_key_to_text(&buf, &k);
+ if (!underflow_err.pos) {
+ bch2_log_msg_start(c, &underflow_err);
+ prt_printf(&underflow_err, "Accounting underflow for\n");
+ }
+ bch2_accounting_key_to_text(&underflow_err, &k);
for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
- prt_printf(&buf, " %lli", v[j]);
-
- bool print = bch2_count_fsck_err(c, accounting_key_underflow, &buf);
- unsigned pos = buf.pos;
- ret = bch2_run_explicit_recovery_pass(c, &buf,
- BCH_RECOVERY_PASS_check_allocations, 0);
- print |= buf.pos != pos;
-
- if (print)
- bch2_print_str(c, KERN_ERR, buf.buf);
- if (ret)
- return ret;
+ prt_printf(&underflow_err, " %lli", v[j]);
+ prt_newline(&underflow_err);
}
+ guard(preempt)();
+ struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
+
switch (k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved:
usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
@@ -986,24 +982,60 @@ int bch2_accounting_read(struct bch_fs *c)
}
}
+ if (underflow_err.pos) {
+ bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
+ unsigned pos = underflow_err.pos;
+ ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+ print |= underflow_err.pos != pos;
+
+ if (print)
+ bch2_print_str(c, KERN_ERR, underflow_err.buf);
+ if (ret)
+ return ret;
+ }
+
return ret;
}
-int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
+int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca)
{
CLASS(btree_trans, trans)(c);
+
+ struct disk_accounting_pos start;
+ disk_accounting_key_init(start, dev_data_type, .dev = ca->dev_idx);
+
+ struct disk_accounting_pos end;
+ disk_accounting_key_init(end, dev_data_type, .dev = ca->dev_idx, .data_type = U8_MAX);
+
return bch2_btree_write_buffer_flush_sync(trans) ?:
- for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
- BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
- struct disk_accounting_pos acc;
- bpos_to_disk_accounting_pos(&acc, k.k->p);
-
- acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
- acc.dev_data_type.dev == dev
- ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
- : 0;
- })) ?:
- bch2_btree_write_buffer_flush_sync(trans);
+ commit_do(trans, NULL, NULL, 0, ({
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_accounting,
+ disk_accounting_pos_to_bpos(&start),
+ disk_accounting_pos_to_bpos(&end),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->type != KEY_TYPE_accounting)
+ continue;
+
+ struct disk_accounting_pos acc;
+ bpos_to_disk_accounting_pos(&acc, k.k->p);
+
+ const unsigned nr = bch2_accounting_counters(k.k);
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ memcpy_u64s_small(v, bkey_s_c_to_accounting(k).v->d, nr);
+
+ bch2_u64s_neg(v, nr);
+
+ ret = bch2_disk_accounting_mod(trans, &acc, v, nr, false);
+ if (ret)
+ break;
+ }
+
+ ret;
+ })) ?: bch2_btree_write_buffer_flush_sync(trans);
}
int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
@@ -1074,13 +1106,17 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
case BCH_DISK_ACCOUNTING_dev_data_type: {
{
guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
+ const enum bch_data_type data_type = acc_k.dev_data_type.data_type;
struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
if (!ca)
continue;
- v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
- v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
- v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
+ v[0] = percpu_u64_get(&ca->usage->d[data_type].buckets);
+ v[1] = percpu_u64_get(&ca->usage->d[data_type].sectors);
+ v[2] = percpu_u64_get(&ca->usage->d[data_type].fragmented);
+
+ if (data_type == BCH_DATA_sb || data_type == BCH_DATA_journal)
+ base.hidden += a.v->d[0] * ca->mi.bucket_size;
}
if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
@@ -1108,7 +1144,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
mismatch = true; \
}
- //check(hidden);
+ check(hidden);
check(btree);
check(data);
check(cached);
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index cc73cce98a44..c0d3d7e8fda6 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -186,11 +186,15 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
break;
case BCH_DISK_ACCOUNTING_dev_data_type: {
guard(rcu)();
+ const enum bch_data_type data_type = acc_k.dev_data_type.data_type;
struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
if (ca) {
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
- this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
+ this_cpu_add(ca->usage->d[data_type].buckets, a.v->d[0]);
+ this_cpu_add(ca->usage->d[data_type].sectors, a.v->d[1]);
+ this_cpu_add(ca->usage->d[data_type].fragmented, a.v->d[2]);
+
+ if (data_type == BCH_DATA_sb || data_type == BCH_DATA_journal)
+ trans->fs_usage_delta.hidden += a.v->d[0] * ca->mi.bucket_size;
}
break;
}
@@ -212,9 +216,9 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
struct accounting_mem_entry *e = &acc->k.data[idx];
- EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
+ const unsigned nr = min_t(unsigned, bch2_accounting_counters(a.k), e->nr_counters);
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ for (unsigned i = 0; i < nr; i++)
this_cpu_add(e->v[gc][i], a.v->d[i]);
return 0;
}
@@ -297,7 +301,7 @@ int bch2_gc_accounting_done(struct bch_fs *);
int bch2_accounting_read(struct bch_fs *);
-int bch2_dev_usage_remove(struct bch_fs *, unsigned);
+int bch2_dev_usage_remove(struct bch_fs *, struct bch_dev *);
int bch2_dev_usage_init(struct bch_dev *, bool);
void bch2_verify_accounting_clean(struct bch_fs *c);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index adc1f9315eab..420f6922dacb 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -345,6 +345,7 @@
x(BCH_ERR_data_read, data_read_no_encryption_key) \
x(BCH_ERR_data_read, data_read_buffer_too_small) \
x(BCH_ERR_data_read, data_read_key_overwritten) \
+ x(0, rbio_narrow_crcs_fail) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 9e69263eb796..a16f55d98d97 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -468,10 +468,10 @@ int __bch2_fsck_err(struct bch_fs *c,
if ((flags & FSCK_ERR_SILENT) ||
test_bit(err, c->sb.errors_silent)) {
- ret = flags & FSCK_CAN_FIX
+ set_bit(BCH_FS_errors_fixed_silent, &c->flags);
+ return flags & FSCK_CAN_FIX
? bch_err_throw(c, fsck_fix)
: bch_err_throw(c, fsck_ignore);
- goto err;
}
printbuf_indent_add_nextline(out, 2);
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 73eb28090bc7..1279026b4c1e 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -146,6 +146,7 @@ int bch2_extent_trim_atomic(struct btree_trans *trans,
if (bpos_ge(bkey_start_pos(k.k), end))
break;
+ nr_iters += 1;
ret = count_iters_for_insert(trans, k, offset, &end, &nr_iters);
if (ret)
break;
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index c0d00a692c18..86aa93ea2345 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1151,7 +1151,7 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
return NULL;
}
-static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
+static bool want_cached_ptr(struct bch_fs *c, struct bch_inode_opts *opts,
struct bch_extent_ptr *ptr)
{
unsigned target = opts->promote_target ?: opts->foreground_target;
@@ -1165,7 +1165,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
}
void bch2_extent_ptr_set_cached(struct bch_fs *c,
- struct bch_io_opts *opts,
+ struct bch_inode_opts *opts,
struct bkey_s k,
struct bch_extent_ptr *ptr)
{
@@ -1241,7 +1241,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
* the promote target.
*/
bool bch2_extent_normalize_by_opts(struct bch_fs *c,
- struct bch_io_opts *opts,
+ struct bch_inode_opts *opts,
struct bkey_s k)
{
struct bkey_ptrs ptrs;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index f6dcb17108cd..03ea7c689d9a 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -686,10 +686,10 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
+void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_inode_opts *,
struct bkey_s, struct bch_extent_ptr *);
-bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
+bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_inode_opts *, struct bkey_s);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 45175a478b92..fe684adca370 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -284,12 +284,12 @@ void bch2_readahead(struct readahead_control *ractl)
{
struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
struct folio *folio;
struct readpages_iter readpages_iter;
struct blk_plug plug;
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+ struct bch_inode_opts opts;
+ bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts);
int ret = readpages_iter_init(&readpages_iter, ractl);
if (ret)
@@ -350,7 +350,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_read_bio *rbio;
- struct bch_io_opts opts;
+ struct bch_inode_opts opts;
struct blk_plug plug;
int ret;
DECLARE_COMPLETION_ONSTACK(done);
@@ -361,7 +361,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
if (!bch2_folio_create(folio, GFP_KERNEL))
return -ENOMEM;
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+ bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts);
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
c,
@@ -407,7 +407,7 @@ struct bch_writepage_io {
struct bch_writepage_state {
struct bch_writepage_io *io;
- struct bch_io_opts opts;
+ struct bch_inode_opts opts;
struct bch_folio_sector *tmp;
unsigned tmp_sectors;
struct blk_plug plug;
@@ -532,6 +532,39 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}
+static bool can_write_now(struct bch_fs *c, unsigned replicas_want, struct closure *cl)
+{
+ unsigned reserved = OPEN_BUCKETS_COUNT -
+ (OPEN_BUCKETS_COUNT - bch2_open_buckets_reserved(BCH_WATERMARK_normal)) / 2;
+
+ if (unlikely(c->open_buckets_nr_free <= reserved)) {
+ closure_wait(&c->open_buckets_wait, cl);
+ return false;
+ }
+
+ if (BCH_WATERMARK_normal < c->journal.watermark && !bch2_journal_error(&c->journal)) {
+ closure_wait(&c->journal.async_wait, cl);
+ return false;
+ }
+
+ return true;
+}
+
+static void throttle_writes(struct bch_fs *c, unsigned replicas_want, struct closure *cl)
+{
+ u64 start = 0;
+ while (!can_write_now(c, replicas_want, cl)) {
+ if (!start)
+ start = local_clock();
+ closure_sync(cl);
+ }
+
+ BUG_ON(closure_nr_remaining(cl) > 1);
+
+ if (start)
+ bch2_time_stats_update(&c->times[BCH_TIME_blocked_writeback_throttle], start);
+}
+
static int __bch2_writepage(struct folio *folio,
struct writeback_control *wbc,
void *data)
@@ -667,26 +700,25 @@ do_io:
return 0;
}
-static int bch2_write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc, void *data)
-{
- struct folio *folio = NULL;
- int error;
-
- while ((folio = writeback_iter(mapping, wbc, folio, &error)))
- error = __bch2_writepage(folio, wbc, data);
- return error;
-}
-
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
- bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
+ bch2_inode_opts_get_inode(c, &to_bch_ei(mapping->host)->ei_inode, &w->opts);
blk_start_plug(&w->plug);
- int ret = bch2_write_cache_pages(mapping, wbc, w);
+
+ struct closure cl;
+ closure_init_stack(&cl);
+
+ struct folio *folio = NULL;
+ int ret = 0;
+
+ while (throttle_writes(c, w->opts.data_replicas, &cl),
+ (folio = writeback_iter(mapping, wbc, folio, &ret)))
+ ret = __bch2_writepage(folio, wbc, w);
+
if (w->io)
bch2_writepage_do_io(w);
blk_finish_plug(&w->plug);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 79823234160f..a104b9d70bea 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -68,7 +68,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct file *file = req->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
struct dio_read *dio;
struct bio *bio;
struct blk_plug plug;
@@ -78,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
size_t shorten;
ssize_t ret;
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+ struct bch_inode_opts opts;
+ bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts);
/* bios must be 512 byte aligned: */
if ((offset|iter->count) & (SECTOR_SIZE - 1))
@@ -445,13 +445,13 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
struct kiocb *req = dio->req;
struct address_space *mapping = dio->mapping;
struct bch_inode_info *inode = dio->inode;
- struct bch_io_opts opts;
+ struct bch_inode_opts opts;
struct bio *bio = &dio->op.wbio.bio;
unsigned unaligned, iter_count;
bool sync = dio->sync, dropped_locks;
long ret;
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+ bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts);
while (1) {
iter_count = dio->iter.count;
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index de0d965f3fde..57e9459afa07 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -627,10 +627,10 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bpos end_pos = POS(inode->v.i_ino, end_sector);
- struct bch_io_opts opts;
+ struct bch_inode_opts opts;
int ret = 0;
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+ bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts);
CLASS(btree_trans, trans)(c);
CLASS(btree_iter, iter)(trans, BTREE_ID_extents,
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index c7bb5b108e2f..d6a2031e17e8 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -2147,9 +2147,11 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
int ret = bch2_inode_rm(c, inode_inum(inode));
if (ret && !bch2_err_matches(ret, EROFS)) {
- bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
- inode->ei_inum.subvol,
- inode->ei_inum.inum);
+ CLASS(printbuf, buf)();
+ bch2_trans_do(c, bch2_inum_to_path(trans, inode->ei_inum, &buf));
+
+ bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu\n%s",
+ inode->ei_inum.subvol, inode->ei_inum.inum, buf.buf);
bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
}
@@ -2236,11 +2238,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct bch_fs *c = sb->s_fs_info;
struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
unsigned shift = sb->s_blocksize_bits - 9;
+
/*
- * this assumes inodes take up 64 bytes, which is a decent average
+ * This assumes inodes take up 64 bytes, which is a decent average
* number:
+ *
+ * Not anymore - bi_dir, bi_dir_offset came later and shouldn't have
+ * been varint fields: seeing 144-160 byte inodes, so let's call it 256
+ * bytes:
*/
- u64 avail_inodes = ((usage.capacity - usage.used) << 3);
+ u64 avail_inodes = ((usage.capacity - usage.used) << 1);
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index bba273d55c37..543627fb58be 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -369,9 +369,9 @@ err:
}
int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
- u64 inode_nr, u32 snapshot,
- struct bch_inode_unpacked *inode,
- unsigned flags)
+ u64 inode_nr, u32 snapshot,
+ struct bch_inode_unpacked *inode,
+ unsigned flags)
{
CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inode_nr, snapshot), flags);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
@@ -673,7 +673,7 @@ static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
{
- unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked;
+ unsigned f = bkey_inode_flags(k);
return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
}
@@ -1223,32 +1223,45 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
return ret;
}
-void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
- struct bch_inode_unpacked *inode)
+void bch2_inode_opts_get_inode(struct bch_fs *c,
+ struct bch_inode_unpacked *inode,
+ struct bch_inode_opts *ret)
{
#define x(_name, _bits) \
if ((inode)->bi_##_name) { \
- opts->_name = inode->bi_##_name - 1; \
- opts->_name##_from_inode = true; \
+ ret->_name = inode->bi_##_name - 1; \
+ ret->_name##_from_inode = true; \
} else { \
- opts->_name = c->opts._name; \
- opts->_name##_from_inode = false; \
+ ret->_name = c->opts._name; \
+ ret->_name##_from_inode = false; \
}
BCH_INODE_OPTS()
#undef x
- bch2_io_opts_fixups(opts);
+ ret->change_cookie = atomic_read(&c->opt_change_cookie);
+
+ bch2_io_opts_fixups(ret);
}
-int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+int bch2_inum_snapshot_opts_get(struct btree_trans *trans,
+ u64 inum, u32 snapshot,
+ struct bch_inode_opts *opts)
{
- struct bch_inode_unpacked inode;
- int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+ if (inum) {
+ struct bch_inode_unpacked inode;
+ int ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
+ if (ret)
+ return ret;
- if (ret)
- return ret;
+ bch2_inode_opts_get_inode(trans->c, &inode, opts);
+ } else {
+ /*
+ * data_update_index_update may call us for reflink btree extent
+ * updates, inum will be 0
+ */
- bch2_inode_opts_get(opts, trans->c, &inode);
+ bch2_inode_opts_get(trans->c, opts);
+ }
return 0;
}
@@ -1346,7 +1359,7 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- return ret ?: bch_err_throw(c, transaction_restart_nested);
+ return ret;
}
/*
@@ -1385,7 +1398,8 @@ next_parent:
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
- delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
+ delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)) ?:
+ bch_err_throw(trans->c, transaction_restart_nested);
}
static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 79092ea74844..63b7088811fb 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -289,9 +289,8 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
-void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
- struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
+void bch2_inode_opts_get_inode(struct bch_fs *, struct bch_inode_unpacked *, struct bch_inode_opts *);
+int bch2_inum_snapshot_opts_get(struct btree_trans *, u64, u32, struct bch_inode_opts *);
int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *, unsigned);
@@ -300,8 +299,8 @@ int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
static inline struct bch_extent_rebalance
bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
{
- struct bch_io_opts io_opts;
- bch2_inode_opts_get(&io_opts, c, inode);
+ struct bch_inode_opts io_opts;
+ bch2_inode_opts_get_inode(c, inode, &io_opts);
return io_opts_to_rebalance_opts(c, &io_opts);
}
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index fa0b06e17d17..04eb5ecd102b 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -24,7 +24,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
subvol_inum inum,
struct btree_iter *iter,
u64 sectors,
- struct bch_io_opts opts,
+ struct bch_inode_opts opts,
s64 *i_sectors_delta,
struct write_point_specifier write_point)
{
@@ -109,7 +109,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
}
ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
- 0, i_sectors_delta, true);
+ 0, i_sectors_delta, true, 0);
err:
if (!ret && sectors_allocated)
bch2_increment_clock(c, sectors_allocated, WRITE);
@@ -211,7 +211,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_cut_back(end_pos, &delete);
ret = bch2_extent_update(trans, inum, iter, &delete,
- &disk_res, 0, i_sectors_delta, false);
+ &disk_res, 0, i_sectors_delta, false, 0);
bch2_disk_reservation_put(c, &disk_res);
}
@@ -373,7 +373,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
- struct bch_io_opts opts;
u64 dst_offset = le64_to_cpu(op->v.dst_offset);
u64 src_offset = le64_to_cpu(op->v.src_offset);
s64 shift = dst_offset - src_offset;
@@ -384,10 +383,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bool warn_errors = i_sectors_delta != NULL;
int ret = 0;
- ret = bch2_inum_opts_get(trans, inum, &opts);
- if (ret)
- return ret;
-
/*
* check for missing subvolume before fpunch, as in resume we don't want
* it to be a fatal error
@@ -476,8 +471,7 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?:
- bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index b93e4d4b3c0c..6a294f2a6dd6 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -3,7 +3,7 @@
#define _BCACHEFS_IO_MISC_H
int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
- u64, struct bch_io_opts, s64 *,
+ u64, struct bch_inode_opts, s64 *,
struct write_point_specifier);
int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 579815c691af..e7ba0d0bf5ef 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -37,12 +37,6 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif
-static bool bch2_poison_extents_on_checksum_error;
-module_param_named(poison_extents_on_checksum_error,
- bch2_poison_extents_on_checksum_error, bool, 0644);
-MODULE_PARM_DESC(poison_extents_on_checksum_error,
- "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
-
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now)
@@ -164,7 +158,7 @@ static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
- struct bch_io_opts opts,
+ struct bch_inode_opts opts,
unsigned flags,
struct bch_io_failures *failed)
{
@@ -545,9 +539,6 @@ static void get_rbio_extent(struct btree_trans *trans,
static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
enum btree_id btree, struct bkey_s_c read_k)
{
- if (!bch2_poison_extents_on_checksum_error)
- return 0;
-
struct bch_fs *c = trans->c;
struct data_update *u = rbio_data_update(rbio);
@@ -749,15 +740,13 @@ static void bch2_rbio_error(struct bch_read_bio *rbio,
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
- struct bch_read_bio *rbio)
+ struct bch_read_bio *rbio,
+ struct bch_extent_crc_unpacked *new_crc)
{
struct bch_fs *c = rbio->c;
u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
int ret = 0;
- if (crc_is_compressed(rbio->pick.crc))
- return 0;
-
CLASS(btree_iter, iter)(trans, rbio->data_btree, rbio->data_pos, BTREE_ITER_intent);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
if ((ret = bkey_err(k)))
@@ -765,21 +754,12 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
- return 0;
-
- /* Extent was merged? */
- if (bkey_start_offset(k.k) < data_offset ||
- k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
- return 0;
+ return bch_err_throw(c, rbio_narrow_crcs_fail);
- struct bch_extent_crc_unpacked new_crc;
- if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
- rbio->pick.crc, NULL, &new_crc,
- bkey_start_offset(k.k) - data_offset, k.k->size,
- rbio->pick.crc.csum_type)) {
- bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
- return 0;
- }
+ /* Extent was trimmed/merged? */
+ if (!bpos_eq(bkey_start_pos(k.k), rbio->data_pos) ||
+ k.k->p.offset != rbio->data_pos.offset + rbio->pick.crc.live_size)
+ return bch_err_throw(c, rbio_narrow_crcs_fail);
/*
* going to be temporarily appending another checksum entry:
@@ -791,17 +771,37 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
bkey_reassemble(new, k);
- if (!bch2_bkey_narrow_crcs(new, new_crc))
- return 0;
+ if (!bch2_bkey_narrow_crcs(new, *new_crc))
+ return bch_err_throw(c, rbio_narrow_crcs_fail);
return bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node);
}
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- CLASS(btree_trans, trans)(rbio->c);
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
+ struct bch_fs *c = rbio->c;
+
+ if (crc_is_compressed(rbio->pick.crc))
+ return;
+
+ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+
+ struct bch_extent_crc_unpacked new_crc;
+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+ rbio->pick.crc, NULL, &new_crc,
+ rbio->data_pos.offset - data_offset, rbio->pick.crc.live_size,
+ rbio->pick.crc.csum_type)) {
+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+ return;
+ }
+
+ CLASS(btree_trans, trans)(c);
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_rbio_narrow_crcs(trans, rbio, &new_crc));
+ if (!ret)
+ count_event(c, io_read_narrow_crcs);
+ else if (ret == -BCH_ERR_rbio_narrow_crcs_fail)
+ count_event(c, io_read_narrow_crcs_fail);
}
static void bch2_read_decompress_err(struct work_struct *work)
@@ -1274,6 +1274,10 @@ retry_pick:
async_object_list_add(c, rbio, rbio, &rbio->list_idx);
+ /* XXX: also nvme read recovery level */
+ if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
+ rbio->bio.bi_opf |= REQ_FUA;
+
if (rbio->bounce)
trace_and_count(c, io_read_bounce, &rbio->bio);
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index 1e1c0476bd03..df4632f6fe9e 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -74,7 +74,7 @@ struct bch_read_bio {
struct bpos data_pos;
struct bversion version;
- struct bch_io_opts opts;
+ struct bch_inode_opts opts;
struct work_struct work;
@@ -192,7 +192,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_fs *c,
- struct bch_io_opts opts,
+ struct bch_inode_opts opts,
bio_end_io_t end_io)
{
struct bch_read_bio *rbio = to_rbio(bio);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index aed22fc7759b..6a5da02ce266 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -205,7 +205,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
struct btree_iter *extent_iter,
u64 new_i_size,
- s64 i_sectors_delta)
+ s64 i_sectors_delta,
+ struct bch_inode_unpacked *inode_u)
{
/*
* Crazy performance optimization:
@@ -227,7 +228,13 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
BTREE_ITER_intent|
BTREE_ITER_cached);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
- int ret = bkey_err(k);
+
+ /*
+ * XXX: we currently need to unpack the inode on every write because we
+ * need the current io_opts, for transactional consistency - inode_v4?
+ */
+ int ret = bkey_err(k) ?:
+ bch2_inode_unpack(k, inode_u);
if (unlikely(ret))
return ret;
@@ -303,8 +310,10 @@ int bch2_extent_update(struct btree_trans *trans,
struct disk_reservation *disk_res,
u64 new_i_size,
s64 *i_sectors_delta_total,
- bool check_enospc)
+ bool check_enospc,
+ u32 change_cookie)
{
+ struct bch_fs *c = trans->c;
struct bpos next_pos;
bool usage_increasing;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
@@ -335,7 +344,7 @@ int bch2_extent_update(struct btree_trans *trans,
if (disk_res &&
disk_sectors_delta > (s64) disk_res->sectors) {
- ret = bch2_disk_reservation_add(trans->c, disk_res,
+ ret = bch2_disk_reservation_add(c, disk_res,
disk_sectors_delta - disk_res->sectors,
!check_enospc || !usage_increasing
? BCH_DISK_RESERVATION_NOFAIL : 0);
@@ -349,9 +358,16 @@ int bch2_extent_update(struct btree_trans *trans,
* aren't changing - for fsync to work properly; fsync relies on
* inode->bi_journal_seq which is updated by the trigger code:
*/
+ struct bch_inode_unpacked inode;
+ struct bch_inode_opts opts;
+
ret = bch2_extent_update_i_size_sectors(trans, iter,
min(k->k.p.offset << 9, new_i_size),
- i_sectors_delta) ?:
+ i_sectors_delta, &inode) ?:
+ (bch2_inode_opts_get_inode(c, &inode, &opts),
+ bch2_bkey_set_needs_rebalance(c, &opts, k,
+ SET_NEEDS_REBALANCE_foreground,
+ change_cookie)) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
BCH_TRANS_COMMIT_no_check_rw|
@@ -402,7 +418,8 @@ static int bch2_write_index_default(struct bch_write_op *op)
ret = bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_check_enospc);
+ op->flags & BCH_WRITE_check_enospc,
+ op->opts.change_cookie);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -792,10 +809,6 @@ static void init_append_extent(struct bch_write_op *op,
bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
op->flags & BCH_WRITE_cached);
-
- if (!(op->flags & BCH_WRITE_move))
- bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
-
bch2_keylist_push(&op->insert_keys);
}
@@ -1225,6 +1238,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return 0;
}
+ struct bch_fs *c = trans->c;
struct bkey_i *new = bch2_trans_kmalloc_nomemzero(trans,
bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance));
int ret = PTR_ERR_OR_ZERO(new);
@@ -1239,8 +1253,6 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
bkey_for_each_ptr(ptrs, ptr)
ptr->unwritten = 0;
- bch2_bkey_set_needs_rebalance(op->c, &op->opts, new);
-
/*
* Note that we're not calling bch2_subvol_get_snapshot() in this path -
* that was done when we kicked off the write, and here it's important
@@ -1248,8 +1260,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
* since been created. The write is still outstanding, so we're ok
* w.r.t. snapshot atomicity:
*/
+
+ /*
+ * For transactional consistency, set_needs_rebalance() has to be called
+ * with the io_opts from the btree in the same transaction:
+ */
+ struct bch_inode_unpacked inode;
+ struct bch_inode_opts opts;
+
return bch2_extent_update_i_size_sectors(trans, iter,
- min(new->k.p.offset << 9, new_i_size), 0) ?:
+ min(new->k.p.offset << 9, new_i_size), 0, &inode) ?:
+ (bch2_inode_opts_get_inode(c, &inode, &opts),
+ bch2_bkey_set_needs_rebalance(c, &opts, new,
+ SET_NEEDS_REBALANCE_foreground,
+ op->opts.change_cookie)) ?:
bch2_trans_update(trans, iter, new,
BTREE_UPDATE_internal_snapshot_node);
}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index 2c0a8f35ee1f..692529bf401d 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -28,10 +28,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
struct bkey_i *, bool *, s64 *, s64 *);
int bch2_extent_update(struct btree_trans *, subvol_inum,
struct btree_iter *, struct bkey_i *,
- struct disk_reservation *, u64, s64 *, bool);
+ struct disk_reservation *, u64, s64 *, bool, u32);
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_io_opts opts)
+ struct bch_inode_opts opts)
{
op->c = c;
op->end_io = NULL;
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index 5da4eb8bb6f6..ab36b03e0a46 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -90,7 +90,7 @@ struct bch_write_op {
struct bch_devs_list devs_have;
u16 target;
u16 nonce;
- struct bch_io_opts opts;
+ struct bch_inode_opts opts;
u32 subvol;
struct bpos pos;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index b9c0834498dd..c533b60706bf 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -51,25 +51,17 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
: 0;
}
-int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+static int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
-}
-
-int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
-{
- return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, true);
}
int __bch2_lru_change(struct btree_trans *trans,
u16 lru_id, u64 dev_bucket,
u64 old_time, u64 new_time)
{
- if (old_time == new_time)
- return 0;
-
- return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
- bch2_lru_set(trans, lru_id, dev_bucket, new_time);
+ return __bch2_lru_set(trans, lru_id, dev_bucket, old_time, false) ?:
+ __bch2_lru_set(trans, lru_id, dev_bucket, new_time, true);
}
static const char * const bch2_lru_types[] = {
@@ -87,7 +79,6 @@ int bch2_lru_check_set(struct btree_trans *trans,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- CLASS(printbuf, buf)();
CLASS(btree_iter, lru_iter)(trans, BTREE_ID_lru, lru_pos(lru_id, dev_bucket, time), 0);
struct bkey_s_c lru_k = bch2_btree_iter_peek_slot(&lru_iter);
int ret = bkey_err(lru_k);
@@ -99,10 +90,13 @@ int bch2_lru_check_set(struct btree_trans *trans,
if (ret)
return ret;
- if (fsck_err(trans, alloc_key_to_missing_lru_entry,
- "missing %s lru entry\n%s",
- bch2_lru_types[lru_type(lru_k)],
- (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
+ CLASS(printbuf, buf)();
+ prt_printf(&buf, "missing %s lru entry at pos ", bch2_lru_types[lru_type(lru_k)]);
+ bch2_bpos_to_text(&buf, lru_iter.pos);
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, referring_k);
+
+ if (fsck_err(trans, alloc_key_to_missing_lru_entry, "%s", buf.buf)) {
ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
if (ret)
return ret;
@@ -127,6 +121,23 @@ static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
}
}
+int bch2_dev_remove_lrus(struct bch_fs *c, struct bch_dev *ca)
+{
+ CLASS(btree_trans, trans)(c);
+ int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ for_each_btree_key(trans, iter,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, ({
+ struct bbpos bp = lru_pos_to_bp(k);
+
+ bp.btree == BTREE_ID_alloc && bp.pos.inode == ca->dev_idx
+ ? (bch2_btree_delete_at(trans, &iter, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0))
+ : 0;
+ }));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
static u64 bkey_lru_type_idx(struct bch_fs *c,
enum bch_lru_type type,
struct bkey_s_c k)
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 6f1e0a7b5db5..d5a2620f2507 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -59,8 +59,6 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
.min_val_size = 8, \
})
-int bch2_lru_del(struct btree_trans *, u16, u64, u64);
-int bch2_lru_set(struct btree_trans *, u16, u64, u64);
int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
static inline int bch2_lru_change(struct btree_trans *trans,
@@ -72,9 +70,10 @@ static inline int bch2_lru_change(struct btree_trans *trans,
: 0;
}
+int bch2_dev_remove_lrus(struct bch_fs *, struct bch_dev *);
+
struct bkey_buf;
int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
-
int bch2_check_lrus(struct bch_fs *);
#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 5b4c3f4b1c25..8a3981e1016e 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -126,8 +126,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
{
CLASS(btree_trans, trans)(c);
+ /* FIXME: this does not handle unknown btrees with data pointers */
for (unsigned id = 0; id < BTREE_ID_NR; id++) {
- if (!btree_type_has_ptrs(id))
+ if (!btree_type_has_data_ptrs(id))
continue;
/* Stripe keys have pointers, but are handled separately */
@@ -167,7 +168,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c,
bch2_bkey_buf_init(&k);
closure_init_stack(&cl);
- for (id = 0; id < BTREE_ID_NR; id++) {
+ for (id = 0; id < btree_id_nr_alive(c); id++) {
bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
BTREE_ITER_prefetch);
retry:
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index c46a8965a7eb..9a440d3f7180 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -46,12 +46,12 @@ struct evacuate_bucket_arg {
static bool evacuate_bucket_pred(struct bch_fs *, void *,
enum btree_id, struct bkey_s_c,
- struct bch_io_opts *,
+ struct bch_inode_opts *,
struct data_update_opts *);
static noinline void
trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
CLASS(printbuf, buf)();
@@ -72,7 +72,7 @@ static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
static noinline void
trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts,
move_pred_fn pred, void *_arg, bool p)
{
@@ -327,7 +327,7 @@ int bch2_move_extent(struct moving_context *ctxt,
struct move_bucket *bucket_in_flight,
struct btree_iter *iter,
struct bkey_s_c k,
- struct bch_io_opts io_opts,
+ struct bch_inode_opts io_opts,
struct data_update_opts data_opts)
{
struct btree_trans *trans = ctxt->trans;
@@ -451,93 +451,6 @@ err:
return ret;
}
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
- struct per_snapshot_io_opts *io_opts,
- struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
- u32 restart_count = trans->restart_count;
- struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
- int ret = 0;
-
- if (btree_iter_path(trans, extent_iter)->level)
- return opts_ret;
-
- if (extent_k.k->type == KEY_TYPE_reflink_v)
- goto out;
-
- if (io_opts->cur_inum != extent_pos.inode) {
- io_opts->d.nr = 0;
-
- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
- BTREE_ITER_all_snapshots, k, ({
- if (k.k->p.offset != extent_pos.inode)
- break;
-
- if (!bkey_is_inode(k.k))
- continue;
-
- struct bch_inode_unpacked inode;
- _ret3 = bch2_inode_unpack(k, &inode);
- if (_ret3)
- break;
-
- struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
- bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
-
- darray_push(&io_opts->d, e);
- }));
- io_opts->cur_inum = extent_pos.inode;
- }
-
- ret = ret ?: trans_was_restarted(trans, restart_count);
- if (ret)
- return ERR_PTR(ret);
-
- if (extent_k.k->p.snapshot)
- darray_for_each(io_opts->d, i)
- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
- opts_ret = &i->io_opts;
- break;
- }
-out:
- ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
- if (ret)
- return ERR_PTR(ret);
- return opts_ret;
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *extent_iter,
- struct bkey_s_c extent_k)
-{
- struct bch_fs *c = trans->c;
-
- *io_opts = bch2_opts_to_inode_opts(c->opts);
-
- /* reflink btree? */
- if (extent_k.k->p.inode) {
- CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes,
- SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
- BTREE_ITER_cached);
- struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter);
- int ret = bkey_err(inode_k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- if (!ret && bkey_is_inode(inode_k.k)) {
- struct bch_inode_unpacked inode;
- bch2_inode_unpack(inode_k, &inode);
- bch2_inode_opts_get(io_opts, c, &inode);
- }
- }
-
- return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
-}
-
int bch2_move_ratelimit(struct moving_context *ctxt)
{
struct bch_fs *c = ctxt->trans->c;
@@ -582,37 +495,6 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
return 0;
}
-/*
- * Move requires non extents iterators, and there's also no need for it to
- * signal indirect_extent_missing_error:
- */
-static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_reflink_p p)
-{
- if (unlikely(REFLINK_P_ERROR(p.v)))
- return bkey_s_c_null;
-
- struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
-
- bch2_trans_iter_init(trans, iter,
- BTREE_ID_reflink, reflink_pos,
- BTREE_ITER_not_extents);
-
- struct bkey_s_c k = bch2_btree_iter_peek(iter);
- if (!k.k || bkey_err(k)) {
- bch2_trans_iter_exit(iter);
- return k;
- }
-
- if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
- bch2_trans_iter_exit(iter);
- return bkey_s_c_null;
- }
-
- return k;
-}
-
int bch2_move_data_btree(struct moving_context *ctxt,
struct bpos start,
struct bpos end,
@@ -622,17 +504,11 @@ int bch2_move_data_btree(struct moving_context *ctxt,
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct per_snapshot_io_opts snapshot_io_opts;
- struct bch_io_opts *io_opts;
+ struct bch_inode_opts *io_opts;
struct bkey_buf sk;
struct btree_iter iter, reflink_iter = {};
struct bkey_s_c k;
struct data_update_opts data_opts;
- /*
- * If we're moving a single file, also process reflinked data it points
- * to (this includes propagating changed io_opts from the inode to the
- * extent):
- */
- bool walk_indirect = start.inode == end.inode;
int ret = 0, ret2;
per_snapshot_io_opts_init(&snapshot_io_opts, c);
@@ -697,8 +573,6 @@ root_err:
bch2_ratelimit_reset(ctxt->rate);
while (!bch2_move_ratelimit(ctxt)) {
- struct btree_iter *extent_iter = &iter;
-
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -717,41 +591,18 @@ root_err:
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
- if (walk_indirect &&
- k.k->type == KEY_TYPE_reflink_p &&
- REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-
- bch2_trans_iter_exit(&reflink_iter);
- k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (!k.k)
- goto next_nondata;
-
- /*
- * XXX: reflink pointers may point to multiple indirect
- * extents, so don't advance past the entire reflink
- * pointer - need to fixup iter->k
- */
- extent_iter = &reflink_iter;
- }
-
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
- iter.pos, extent_iter, k);
+ io_opts = bch2_extent_get_apply_io_opts(trans, &snapshot_io_opts,
+ iter.pos, &iter, k,
+ SET_NEEDS_REBALANCE_other);
ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
+ if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
goto next;
/*
@@ -762,7 +613,7 @@ root_err:
k = bkey_i_to_s_c(sk.k);
if (!level)
- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
else if (!data_opts.scrub)
ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
k.k->p, data_opts.target, 0);
@@ -824,7 +675,7 @@ static int bch2_move_data(struct bch_fs *c,
unsigned min_depth_this_btree = min_depth;
/* Stripe keys have pointers, but are handled separately */
- if (!btree_type_has_ptrs(id) ||
+ if (!btree_type_has_data_ptrs(id) ||
id == BTREE_ID_stripes)
min_depth_this_btree = max(min_depth_this_btree, 1);
@@ -859,7 +710,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
bool is_kthread = current->flags & PF_KTHREAD;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter = {};
struct bkey_buf sk;
struct bkey_s_c k;
@@ -867,6 +717,9 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
u64 check_mismatch_done = bucket_start;
int ret = 0;
+ struct bch_inode_opts io_opts;
+ bch2_inode_opts_get(c, &io_opts);
+
/* Userspace might have supplied @dev: */
CLASS(bch2_dev_tryget_noerror, ca)(c, dev);
if (!ca)
@@ -942,7 +795,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
goto next;
if (!bp.v->level) {
- ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
+ ret = bch2_extent_get_apply_io_opts_one(trans, &io_opts, &iter, k,
+ SET_NEEDS_REBALANCE_other);
if (ret) {
bch2_trans_iter_exit(&iter);
continue;
@@ -1039,7 +893,7 @@ int bch2_move_data_phys(struct bch_fs *c,
static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
struct evacuate_bucket_arg *arg = _arg;
@@ -1080,7 +934,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
}
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
- struct btree *, struct bch_io_opts *,
+ struct btree *, struct bch_inode_opts *,
struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
@@ -1090,7 +944,6 @@ static int bch2_move_btree(struct bch_fs *c,
struct bch_move_stats *stats)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct moving_context ctxt;
struct btree_trans *trans;
struct btree_iter iter;
@@ -1099,6 +952,9 @@ static int bch2_move_btree(struct bch_fs *c,
struct data_update_opts data_opts;
int ret = 0;
+ struct bch_inode_opts io_opts;
+ bch2_inode_opts_get(c, &io_opts);
+
bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
writepoint_ptr(&c->btree_write_point),
true);
@@ -1159,7 +1015,7 @@ next:
static bool rereplicate_pred(struct bch_fs *c, void *arg,
enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
unsigned nr_good = bch2_bkey_durability(c, k);
@@ -1190,7 +1046,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
static bool migrate_pred(struct bch_fs *c, void *arg,
enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -1227,7 +1083,7 @@ static bool bformat_needs_redo(struct bkey_format *f)
static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
struct btree *b,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
if (b->version_ondisk != c->sb.version ||
@@ -1264,7 +1120,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
unsigned durability = bch2_bkey_durability(c, k);
@@ -1302,7 +1158,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
static bool scrub_pred(struct bch_fs *c, void *_arg,
enum btree_id btree, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bch_ioctl_data *arg = _arg;
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 481026ff99ab..754b0ad45950 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -73,7 +73,7 @@ do { \
} while (1)
typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
- struct bch_io_opts *, struct data_update_opts *);
+ struct bch_inode_opts *, struct data_update_opts *);
extern const char * const bch2_data_ops_strs[];
@@ -87,45 +87,15 @@ void bch2_moving_ctxt_flush_all(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
int bch2_move_ratelimit(struct moving_context *);
-/* Inodes in different snapshots may have different IO options: */
-struct snapshot_io_opts_entry {
- u32 snapshot;
- struct bch_io_opts io_opts;
-};
-
-struct per_snapshot_io_opts {
- u64 cur_inum;
- struct bch_io_opts fs_io_opts;
- DARRAY(struct snapshot_io_opts_entry) d;
-};
-
-static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
-{
- memset(io_opts, 0, sizeof(*io_opts));
- io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
-}
-
-static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
-{
- darray_exit(&io_opts->d);
-}
-
-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
- struct btree_iter *, struct bkey_s_c);
-
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
int bch2_move_extent(struct moving_context *,
struct move_bucket *,
struct btree_iter *,
struct bkey_s_c,
- struct bch_io_opts,
+ struct bch_inode_opts,
struct data_update_opts);
-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
- struct per_snapshot_io_opts *, struct bpos,
- struct btree_iter *, struct bkey_s_c);
-
int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
move_pred_fn, void *, enum btree_id, unsigned);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index c3ef35dc01e2..122bc98e4cbb 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -518,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out,
}
}
-int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v)
+int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v)
{
int ret = 0;
@@ -531,6 +531,8 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id
case Opt_compression:
case Opt_background_compression:
ret = bch2_check_set_has_compressed_data(c, v);
+ if (ret)
+ return ret;
break;
case Opt_erasure_code:
if (v)
@@ -546,7 +548,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id
int bch2_opts_hooks_pre_set(struct bch_fs *c)
{
for (unsigned i = 0; i < bch2_opts_nr; i++) {
- int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
+ int ret = bch2_opt_hook_pre_set(c, NULL, 0, i, bch2_opt_get_by_id(&c->opts, i));
if (ret)
return ret;
}
@@ -555,26 +557,15 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c)
}
void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
- struct bch_opts *new_opts, enum bch_opt_id id)
+ enum bch_opt_id id, u64 v)
{
switch (id) {
case Opt_foreground_target:
- if (new_opts->foreground_target &&
- !new_opts->background_target)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
case Opt_compression:
- if (new_opts->compression &&
- !new_opts->background_compression)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
case Opt_background_target:
- if (new_opts->background_target)
- bch2_set_rebalance_needs_scan(c, inum);
- break;
case Opt_background_compression:
- if (new_opts->background_compression)
- bch2_set_rebalance_needs_scan(c, inum);
+ bch2_set_rebalance_needs_scan(c, inum);
+ bch2_rebalance_wakeup(c);
break;
case Opt_rebalance_enabled:
bch2_rebalance_wakeup(c);
@@ -600,12 +591,14 @@ void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
* upgrades at runtime as well, but right now there's nothing
* that does that:
*/
- if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible)
+ if (v == BCH_VERSION_UPGRADE_incompatible)
bch2_sb_upgrade_incompat(c);
break;
default:
break;
}
+
+ atomic_inc(&c->opt_change_cookie);
}
int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
@@ -802,16 +795,17 @@ bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
/* io opts: */
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+void bch2_inode_opts_get(struct bch_fs *c, struct bch_inode_opts *ret)
{
- struct bch_io_opts opts = {
-#define x(_name, _bits) ._name = src._name,
+ memset(ret, 0, sizeof(*ret));
+
+#define x(_name, _bits) ret->_name = c->opts._name,
BCH_INODE_OPTS()
#undef x
- };
- bch2_io_opts_fixups(&opts);
- return opts;
+ ret->change_cookie = atomic_read(&c->opt_change_cookie);
+
+ bch2_io_opts_fixups(ret);
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index f8828f4699c7..22cf109fb9c9 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -658,10 +658,9 @@ void bch2_opts_to_text(struct printbuf *,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);
-int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64);
+int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64);
int bch2_opts_hooks_pre_set(struct bch_fs *);
-void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64,
- struct bch_opts *, enum bch_opt_id);
+void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64);
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
struct printbuf *, const char *, const char *);
@@ -670,16 +669,19 @@ int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
/* inode opts: */
-struct bch_io_opts {
+struct bch_inode_opts {
#define x(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef x
+
#define x(_name, _bits) u64 _name##_from_inode:1;
BCH_INODE_OPTS()
#undef x
+
+ u32 change_cookie;
};
-static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
+static inline void bch2_io_opts_fixups(struct bch_inode_opts *opts)
{
if (!opts->background_target)
opts->background_target = opts->foreground_target;
@@ -692,7 +694,7 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
}
}
-struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+void bch2_inode_opts_get(struct bch_fs *, struct bch_inode_opts *);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
index 792fc6fef270..541ee951d1c9 100644
--- a/fs/bcachefs/progress.c
+++ b/fs/bcachefs/progress.c
@@ -12,7 +12,7 @@ void bch2_progress_init(struct progress_indicator_state *s,
s->next_print = jiffies + HZ * 10;
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
if (!(btree_id_mask & BIT_ULL(i)))
continue;
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 35aff96bf12a..fa73de7890da 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -93,7 +93,7 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
}
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
- struct bch_io_opts *opts,
+ struct bch_inode_opts *opts,
struct bkey_s_c k,
struct bkey_ptrs_c ptrs)
{
@@ -120,7 +120,7 @@ static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
}
static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
- struct bch_io_opts *opts,
+ struct bch_inode_opts *opts,
struct bkey_ptrs_c ptrs)
{
if (!opts->background_target ||
@@ -141,7 +141,7 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
}
static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
- struct bch_io_opts *opts,
+ struct bch_inode_opts *opts,
struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -194,7 +194,7 @@ incompressible:
return sectors;
}
-static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
+static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_inode_opts *opts,
struct bkey_s_c k)
{
if (!bkey_extent_is_direct_data(k.k))
@@ -210,8 +210,10 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt
}
}
-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
- struct bkey_i *_k)
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
+ struct bkey_i *_k,
+ enum set_needs_rebalance_ctx ctx,
+ u32 change_cookie)
{
if (!bkey_extent_is_direct_data(&_k->k))
return 0;
@@ -235,10 +237,11 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
return 0;
}
-int bch2_get_update_rebalance_opts(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct btree_iter *iter,
- struct bkey_s_c k)
+static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+ struct bch_inode_opts *io_opts,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ enum set_needs_rebalance_ctx ctx)
{
BUG_ON(iter->flags & BTREE_ITER_is_extents);
BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
@@ -267,10 +270,121 @@ int bch2_get_update_rebalance_opts(struct btree_trans *trans,
/* On successfull transaction commit, @k was invalidated: */
- return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
+ return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n, ctx, 0) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
- bch_err_throw(trans->c, transaction_restart_nested);
+ bch_err_throw(trans->c, transaction_restart_commit);
+}
+
+static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *io_opts,
+ struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
+ struct btree_iter *extent_iter,
+ struct bkey_s_c extent_k)
+{
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ if (btree_iter_path(trans, extent_iter)->level)
+ return &io_opts->fs_io_opts;
+
+ if (extent_k.k->type == KEY_TYPE_reflink_v)
+ return &io_opts->fs_io_opts;
+
+ if (io_opts->cur_inum != extent_pos.inode) {
+ io_opts->d.nr = 0;
+
+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
+ BTREE_ITER_all_snapshots, k, ({
+ if (k.k->p.offset != extent_pos.inode)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ _ret3 = bch2_inode_unpack(k, &inode);
+ if (_ret3)
+ break;
+
+ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+ bch2_inode_opts_get_inode(c, &inode, &e.io_opts);
+
+ darray_push(&io_opts->d, e);
+ }));
+ io_opts->cur_inum = extent_pos.inode;
+ }
+
+ ret = ret ?: trans_was_restarted(trans, restart_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (extent_k.k->p.snapshot)
+ darray_for_each(io_opts->d, i)
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+ return &i->io_opts;
+
+ return &io_opts->fs_io_opts;
+}
+
+struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *snapshot_io_opts,
+ struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
+ struct btree_iter *extent_iter,
+ struct bkey_s_c extent_k,
+ enum set_needs_rebalance_ctx ctx)
+{
+ struct bch_inode_opts *opts =
+ bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k);
+ if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level)
+ return opts;
+
+ int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx);
+ return ret ? ERR_PTR(ret) : opts;
+}
+
+int bch2_extent_get_io_opts_one(struct btree_trans *trans,
+ struct bch_inode_opts *io_opts,
+ struct btree_iter *extent_iter,
+ struct bkey_s_c extent_k,
+ enum set_needs_rebalance_ctx ctx)
+{
+ struct bch_fs *c = trans->c;
+
+ bch2_inode_opts_get(c, io_opts);
+
+ /* reflink btree? */
+ if (extent_k.k->p.inode) {
+ CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes,
+ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+ BTREE_ITER_cached);
+ struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter);
+ int ret = bkey_err(inode_k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ if (!ret && bkey_is_inode(inode_k.k)) {
+ struct bch_inode_unpacked inode;
+ bch2_inode_unpack(inode_k, &inode);
+ bch2_inode_opts_get_inode(c, &inode, io_opts);
+ }
+ }
+
+ return 0;
+}
+
+int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans,
+ struct bch_inode_opts *io_opts,
+ struct btree_iter *extent_iter,
+ struct bkey_s_c extent_k,
+ enum set_needs_rebalance_ctx ctx)
+{
+ int ret = bch2_extent_get_io_opts_one(trans, io_opts, extent_iter, extent_k, ctx);
+ if (ret || btree_iter_path(trans, extent_iter)->level)
+ return ret;
+
+ return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx);
}
#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
@@ -403,9 +517,10 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bpos work_pos,
struct btree_iter *extent_iter,
- struct bch_io_opts *io_opts,
+ struct bch_inode_opts **opts_ret,
struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
@@ -419,13 +534,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
if (bkey_err(k))
return k;
- int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
+ struct bch_inode_opts *opts =
+ bch2_extent_get_apply_io_opts(trans, snapshot_io_opts,
+ extent_iter->pos, extent_iter, k,
+ SET_NEEDS_REBALANCE_other);
+ int ret = PTR_ERR_OR_ZERO(opts);
if (ret)
return bkey_s_c_err(ret);
+ *opts_ret = opts;
+
memset(data_opts, 0, sizeof(*data_opts));
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
- data_opts->target = io_opts->background_target;
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k);
+ data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs;
if (!data_opts->rewrite_ptrs) {
@@ -450,19 +571,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
+ unsigned p = bch2_bkey_ptrs_need_compress(c, opts, k, ptrs);
if (p) {
prt_str(&buf, "compression=");
- bch2_compression_opt_to_text(&buf, io_opts->background_compression);
+ bch2_compression_opt_to_text(&buf, opts->background_compression);
prt_str(&buf, " ");
bch2_prt_u64_base2(&buf, p);
prt_newline(&buf);
}
- p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
+ p = bch2_bkey_ptrs_need_move(c, opts, ptrs);
if (p) {
prt_str(&buf, "move=");
- bch2_target_to_text(&buf, c, io_opts->background_target);
+ bch2_target_to_text(&buf, c, opts->background_target);
prt_str(&buf, " ");
bch2_prt_u64_base2(&buf, p);
prt_newline(&buf);
@@ -477,6 +598,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
+ struct per_snapshot_io_opts *snapshot_io_opts,
struct bpos work_pos,
struct btree_iter *extent_iter)
{
@@ -484,7 +606,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &trans->c->rebalance;
struct data_update_opts data_opts;
- struct bch_io_opts io_opts;
+ struct bch_inode_opts *io_opts;
struct bkey_s_c k;
struct bkey_buf sk;
int ret;
@@ -495,8 +617,8 @@ static int do_rebalance_extent(struct moving_context *ctxt,
bch2_bkey_buf_init(&sk);
ret = lockrestart_do(trans,
- bkey_err(k = next_rebalance_extent(trans, work_pos,
- extent_iter, &io_opts, &data_opts)));
+ bkey_err(k = next_rebalance_extent(trans, snapshot_io_opts,
+ work_pos, extent_iter, &io_opts, &data_opts)));
if (ret || !k.k)
goto out;
@@ -509,7 +631,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
if (ret) {
if (bch2_err_matches(ret, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
@@ -528,7 +650,31 @@ out:
return ret;
}
+static int do_rebalance_scan_indirect(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ struct bch_inode_opts *opts)
+{
+ u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
+ u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad);
+ u32 restart_count = trans->restart_count;
+
+ int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink,
+ POS(0, idx), BTREE_ITER_not_extents, k, ({
+ if (bpos_ge(bkey_start_pos(k.k), POS(0, end)))
+ break;
+ bch2_get_update_rebalance_opts(trans, opts, &iter, k,
+ SET_NEEDS_REBALANCE_opt_change_indirect);
+ }));
+ if (ret)
+ return ret;
+
+ /* suppress trans_was_restarted() check */
+ trans->restart_count = restart_count;
+ return 0;
+}
+
static int do_rebalance_scan(struct moving_context *ctxt,
+ struct per_snapshot_io_opts *snapshot_io_opts,
u64 inum, u64 cookie, u64 *sectors_scanned)
{
struct btree_trans *trans = ctxt->trans;
@@ -548,32 +694,33 @@ static int do_rebalance_scan(struct moving_context *ctxt,
r->state = BCH_REBALANCE_scanning;
- struct per_snapshot_io_opts snapshot_io_opts;
- per_snapshot_io_opts_init(&snapshot_io_opts, c);
-
int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
r->scan_start.pos, r->scan_end.pos,
BTREE_ITER_all_snapshots|
BTREE_ITER_prefetch, k, ({
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
- struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
- &snapshot_io_opts, iter.pos, &iter, k);
- PTR_ERR_OR_ZERO(io_opts);
+ struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
+ snapshot_io_opts, iter.pos, &iter, k,
+ SET_NEEDS_REBALANCE_opt_change);
+ PTR_ERR_OR_ZERO(opts) ?:
+ (inum &&
+ k.k->type == KEY_TYPE_reflink_p &&
+ REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)
+ ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts)
+ : 0);
})) ?:
commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
- per_snapshot_io_opts_exit(&snapshot_io_opts);
*sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
- bch2_move_stats_exit(&r->scan_stats, c);
-
/*
* Ensure that the rebalance_work entries we created are seen by the
* next iteration of do_rebalance(), so we don't end up stuck in
* rebalance_wait():
*/
*sectors_scanned += 1;
+ bch2_move_stats_exit(&r->scan_stats, c);
bch2_btree_write_buffer_flush_sync(trans);
@@ -625,6 +772,9 @@ static int do_rebalance(struct moving_context *ctxt)
bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ struct per_snapshot_io_opts snapshot_io_opts;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+
while (!bch2_move_ratelimit(ctxt)) {
if (!bch2_rebalance_enabled(c)) {
bch2_moving_ctxt_flush_all(ctxt);
@@ -639,15 +789,18 @@ static int do_rebalance(struct moving_context *ctxt)
break;
ret = k->k.type == KEY_TYPE_cookie
- ? do_rebalance_scan(ctxt, k->k.p.inode,
+ ? do_rebalance_scan(ctxt, &snapshot_io_opts,
+ k->k.p.inode,
le64_to_cpu(bkey_i_to_cookie(k)->v.cookie),
&sectors_scanned)
- : do_rebalance_extent(ctxt, k->k.p, &extent_iter);
+ : do_rebalance_extent(ctxt, &snapshot_io_opts,
+ k->k.p, &extent_iter);
if (ret)
break;
}
bch2_trans_iter_exit(&extent_iter);
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
bch2_move_stats_exit(&r->work_stats, c);
if (!ret &&
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 4a8812a65c61..bff91aa0102e 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -8,7 +8,7 @@
#include "rebalance_types.h"
static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
- struct bch_io_opts *opts)
+ struct bch_inode_opts *opts)
{
struct bch_extent_rebalance r = {
.type = BIT(BCH_EXTENT_ENTRY_rebalance),
@@ -30,11 +30,51 @@ void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance *);
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
-int bch2_get_update_rebalance_opts(struct btree_trans *,
- struct bch_io_opts *,
- struct btree_iter *,
- struct bkey_s_c);
+
+enum set_needs_rebalance_ctx {
+ SET_NEEDS_REBALANCE_opt_change,
+ SET_NEEDS_REBALANCE_opt_change_indirect,
+ SET_NEEDS_REBALANCE_foreground,
+ SET_NEEDS_REBALANCE_other,
+};
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *,
+ struct bkey_i *, enum set_needs_rebalance_ctx, u32);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+ u32 snapshot;
+ struct bch_inode_opts io_opts;
+};
+
+struct per_snapshot_io_opts {
+ u64 cur_inum;
+ struct bch_inode_opts fs_io_opts;
+ DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+ memset(io_opts, 0, sizeof(*io_opts));
+ bch2_inode_opts_get(c, &io_opts->fs_io_opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+ darray_exit(&io_opts->d);
+}
+
+struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bpos,
+ struct btree_iter *, struct bkey_s_c,
+ enum set_needs_rebalance_ctx);
+
+int bch2_extent_get_io_opts_one(struct btree_trans *, struct bch_inode_opts *,
+ struct btree_iter *, struct bkey_s_c,
+ enum set_needs_rebalance_ctx);
+int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opts *,
+ struct btree_iter *, struct bkey_s_c,
+ enum set_needs_rebalance_ctx);
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 8679c8aad0e7..531c2ef128ae 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -837,33 +837,39 @@ use_clean:
bch2_async_btree_node_rewrites_flush(c);
/* fsync if we fixed errors */
- if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+ bool errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags) ||
+ test_bit(BCH_FS_errors_fixed_silent, &c->flags);
+
+ if (errors_fixed) {
bch2_journal_flush_all_pins(&c->journal);
bch2_journal_meta(&c->journal);
}
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- test_bit(BCH_FS_errors_fixed, &c->flags) &&
+ errors_fixed &&
!test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
!test_bit(BCH_FS_error, &c->flags)) {
bch2_flush_fsck_errs(c);
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
+ errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags);
clear_bit(BCH_FS_errors_fixed, &c->flags);
+ clear_bit(BCH_FS_errors_fixed_silent, &c->flags);
ret = bch2_run_recovery_passes(c,
BCH_RECOVERY_PASS_check_alloc_info);
if (ret)
goto err;
- if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
+ if (errors_fixed ||
test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
bch_err(c, "Second fsck run was not clean");
set_bit(BCH_FS_errors_not_fixed, &c->flags);
}
- set_bit(BCH_FS_errors_fixed, &c->flags);
+ if (errors_fixed)
+ set_bit(BCH_FS_errors_fixed, &c->flags);
}
if (enabled_qtypes(c)) {
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 238a362de19e..d54468fdcb18 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -589,7 +589,6 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
- struct bch_io_opts opts;
struct bpos src_want;
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
@@ -609,10 +608,6 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_init(&new_src);
CLASS(btree_trans, trans)(c);
- ret = bch2_inum_opts_get(trans, src_inum, &opts);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
BTREE_ITER_intent);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@@ -709,11 +704,10 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?:
- bch2_extent_update(trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
+ ret = bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true, 0);
bch2_disk_reservation_put(c, &disk_res);
}
bch2_trans_iter_exit(&dst_iter);
@@ -744,7 +738,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_exit(&inode_iter);
} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-err:
+
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index 17cd617664d9..3907ba7edff2 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -23,6 +23,8 @@ enum counters_flags {
x(io_read_reuse_race, 34, TYPE_COUNTER) \
x(io_read_retry, 32, TYPE_COUNTER) \
x(io_read_fail_and_poison, 95, TYPE_COUNTER) \
+ x(io_read_narrow_crcs, 97, TYPE_COUNTER) \
+ x(io_read_narrow_crcs_fail, 98, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(io_move_read, 35, TYPE_SECTORS) \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 7c6f18a1ee2a..728d878057af 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -160,7 +160,7 @@ enum bch_fsck_flags {
x(extent_ptrs_unwritten, 140, 0) \
x(extent_ptrs_written_and_unwritten, 141, 0) \
x(ptr_to_invalid_device, 142, 0) \
- x(ptr_to_removed_device, 322, 0) \
+ x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \
x(ptr_to_duplicate_device, 143, 0) \
x(ptr_after_last_bucket, 144, 0) \
x(ptr_before_first_bucket, 145, 0) \
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 32b12311928e..473ad4b51180 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -322,6 +322,8 @@ static void __bch2_fs_read_only(struct bch_fs *c)
do {
clean_passes++;
+ bch2_do_discards_going_ro(c);
+
if (bch2_btree_interior_updates_flush(c) ||
bch2_btree_write_buffer_flush_going_ro(c) ||
bch2_journal_flush_all_pins(&c->journal) ||
@@ -1209,12 +1211,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
bch2_opts_apply(&c->opts, *opts);
+#ifdef __KERNEL__
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
c->opts.block_size > PAGE_SIZE) {
bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
ret = -EINVAL;
goto err;
}
+#endif
c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
if (c->opts.inodes_use_key_cache)
@@ -1286,7 +1290,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
if (ret)
goto err;
- if (go_rw_in_recovery(c)) {
+ /*
+ * just make sure this is always allocated if we might need it - mount
+ * failing due to kthread_create() failing is _very_ annoying
+ */
+ if (!(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) ||
+ go_rw_in_recovery(c)) {
/*
* start workqueues/kworkers early - kthread creation checks for
* pending signals, which is _very_ annoying
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 6b071dcc062b..4c6e6c46d18a 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -784,7 +784,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
u64 v;
ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
- bch2_opt_hook_pre_set(c, ca, id, v);
+ bch2_opt_hook_pre_set(c, ca, 0, id, v);
kfree(tmp);
if (ret < 0)
@@ -807,7 +807,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
bch2_opt_set_by_id(&c->opts, id, v);
if (changed)
- bch2_opt_hook_post_set(c, ca, 0, &c->opts, id);
+ bch2_opt_hook_post_set(c, ca, 0, id, v);
ret = size;
err:
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 269cdf1a87a4..6c312fd9a447 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -720,47 +720,55 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail,
);
DECLARE_EVENT_CLASS(discard_buckets_class,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
+ TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err),
+ TP_ARGS(c, s, err),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u64, seen )
__field(u64, open )
__field(u64, need_journal_commit )
+ __field(u64, commit_in_flight )
+ __field(u64, bad_data_type )
+ __field(u64, already_discarding )
__field(u64, discarded )
__array(char, err, 16 )
),
TP_fast_assign(
__entry->dev = c->dev;
- __entry->seen = seen;
- __entry->open = open;
- __entry->need_journal_commit = need_journal_commit;
- __entry->discarded = discarded;
+ __entry->seen = s->seen;
+ __entry->open = s->open;
+ __entry->need_journal_commit = s->need_journal_commit;
+ __entry->commit_in_flight = s->commit_in_flight;
+ __entry->bad_data_type = s->bad_data_type;
+ __entry->already_discarding = s->already_discarding;
+ __entry->discarded = s->discarded;
strscpy(__entry->err, err, sizeof(__entry->err));
),
- TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+ TP_printk("%d%d seen %llu open %llu\n"
+ "need_commit %llu committing %llu bad_data_type %llu\n"
+ "already_discarding %llu discarded %llu err %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->seen,
__entry->open,
__entry->need_journal_commit,
+ __entry->commit_in_flight,
+ __entry->bad_data_type,
+ __entry->already_discarding,
__entry->discarded,
__entry->err)
);
DEFINE_EVENT(discard_buckets_class, discard_buckets,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+ TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err),
+ TP_ARGS(c, s, err)
);
DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
- TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
- u64 need_journal_commit, u64 discarded, const char *err),
- TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+ TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err),
+ TP_ARGS(c, s, err)
);
TRACE_EVENT(bucket_invalidate,
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 6d7303008b19..784e75a21132 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -535,10 +535,9 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
return -EINVAL;
s.id = inode_opt_id;
+ u64 v = 0;
if (value) {
- u64 v = 0;
-
buf = kmalloc(size + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -551,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
if (ret < 0)
goto err;
- ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
+ ret = bch2_opt_hook_pre_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v);
if (ret < 0)
goto err;
@@ -591,6 +590,8 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
}
+
+ bch2_opt_hook_post_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v);
err:
return bch2_err_class(ret);
}