-rw-r--r--  .bcachefs_revision                    |   2
-rw-r--r--  Makefile                              |   2
-rw-r--r--  c_src/cmd_option.c                    |   4
-rw-r--r--  include/linux/atomic.h                |   7
-rw-r--r--  include/linux/closure.h               |  33
-rw-r--r--  libbcachefs/Makefile                  |   2
-rw-r--r--  libbcachefs/alloc_foreground.h        |   2
-rw-r--r--  libbcachefs/backpointers.c            |  14
-rw-r--r--  libbcachefs/bcachefs_format.h         |   8
-rw-r--r--  libbcachefs/bkey_methods.c            |   9
-rw-r--r--  libbcachefs/bkey_methods.h            |   3
-rw-r--r--  libbcachefs/btree_cache.c             |  16
-rw-r--r--  libbcachefs/btree_gc.c                |  42
-rw-r--r--  libbcachefs/btree_key_cache.c         |  14
-rw-r--r--  libbcachefs/btree_locking.c           |  14
-rw-r--r--  libbcachefs/buckets.c                 |  59
-rw-r--r--  libbcachefs/data_update.c             |   4
-rw-r--r--  libbcachefs/disk_accounting_format.h  |  10
-rw-r--r--  libbcachefs/ec.c                      |  13
-rw-r--r--  libbcachefs/extents.c                 |  47
-rw-r--r--  libbcachefs/extents.h                 |   5
-rw-r--r--  libbcachefs/fs-io-direct.c            |   1
-rw-r--r--  libbcachefs/fsck.c                    |   6
-rw-r--r--  libbcachefs/io_read.c                 |  13
-rw-r--r--  libbcachefs/migrate.c                 |  20
-rw-r--r--  libbcachefs/opts.c                    |  34
-rw-r--r--  libbcachefs/opts.h                    |  24
-rw-r--r--  libbcachefs/progress.c                |  39
-rw-r--r--  libbcachefs/progress.h                |  12
-rw-r--r--  libbcachefs/rebalance.c               | 282
-rw-r--r--  libbcachefs/rebalance.h               |   4
-rw-r--r--  libbcachefs/recovery.c                |  12
-rw-r--r--  libbcachefs/sb-downgrade.c            |  11
-rw-r--r--  libbcachefs/sb-errors_format.h        |   5
-rw-r--r--  libbcachefs/super.c                   |  59
-rw-r--r--  libbcachefs/sysfs.c                   | 119
-rw-r--r--  libbcachefs/util.c                    |   8
-rw-r--r--  libbcachefs/xattr.c                   |   2
-rw-r--r--  linux/closure.c                       | 195
39 files changed, 597 insertions, 559 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 13c25773..5fc72f30 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-292344971769fe1dd561d8844c57c15c833f91ef
+1c8d3fc41e7291ee39458e225a1ceac76bb8d8f1
diff --git a/Makefile b/Makefile
index 11a5304b..db2944b0 100644
--- a/Makefile
+++ b/Makefile
@@ -266,8 +266,10 @@ update-bcachefs-sources:
git rm -rf --ignore-unmatch libbcachefs
test -d libbcachefs || mkdir libbcachefs
cp $(LINUX_DIR)/fs/bcachefs/*.[ch] libbcachefs/
+ cp $(LINUX_DIR)/fs/bcachefs/Makefile libbcachefs/
rm libbcachefs/fast_list.c libbcachefs/async_objs.c
git add libbcachefs/*.[ch]
+ git add libbcachefs/Makefile
git rm -f libbcachefs/mean_and_variance_test.c
cp $(LINUX_DIR)/include/linux/closure.h include/linux/
git add include/linux/closure.h
diff --git a/c_src/cmd_option.c b/c_src/cmd_option.c
index 433d6196..1ae4e076 100644
--- a/c_src/cmd_option.c
+++ b/c_src/cmd_option.c
@@ -117,7 +117,7 @@ int cmd_set_option(int argc, char *argv[])
fprintf(stderr, "Can't set option %s\n", opt->attr.name);
if (opt->flags & OPT_FS) {
- ret = bch2_opt_hook_pre_set(c, NULL, 0, i, v);
+ ret = bch2_opt_hook_pre_set(c, NULL, 0, i, v, true);
if (ret < 0) {
fprintf(stderr, "error setting %s: %i\n", opt->attr.name, ret);
continue;
@@ -135,7 +135,7 @@ int cmd_set_option(int argc, char *argv[])
continue;
}
- ret = bch2_opt_hook_pre_set(c, ca, 0, i, v);
+ ret = bch2_opt_hook_pre_set(c, ca, 0, i, v, true);
if (ret < 0) {
fprintf(stderr, "error setting %s: %i\n", opt->attr.name, ret);
continue;
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index ae87a25a..62c70879 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -275,6 +275,13 @@ static inline bool a_type##_try_cmpxchg_acquire(a_type##_t *v, i_type *old, i_ty
i_type prev = *old; \
*old = cmpxchg_acquire(&v->counter, *old, new); \
return prev == *old; \
+} \
+ \
+static inline bool a_type##_try_cmpxchg_release(a_type##_t *v, i_type *old, i_type new)\
+{ \
+ i_type prev = *old; \
+ *old = cmpxchg_release(&v->counter, *old, new); \
+ return prev == *old; \
}
DEF_ATOMIC_OPS(atomic, int)
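The new release variant completes the acquire/release pair. The point of the try_cmpxchg form is that it writes the observed value back through *old on failure, so CAS loops need no separate re-read. A minimal userspace sketch of the same loop shape, using portable C11 atomics rather than the kernel wrappers (the function names here are illustrative only, not part of the patch):

#include <stdatomic.h>
#include <stdbool.h>

static bool int_try_cmpxchg_release(atomic_int *v, int *old, int new)
{
	/* On failure, *old is updated to the value actually observed,
	 * so the caller's loop needs no separate re-read. */
	return atomic_compare_exchange_weak_explicit(v, old, new,
						     memory_order_release,
						     memory_order_relaxed);
}

static void saturating_dec(atomic_int *v)
{
	int old = atomic_load_explicit(v, memory_order_relaxed);

	do {
		if (old == 0)
			return;		/* already at the floor */
	} while (!int_try_cmpxchg_release(v, &old, old - 1));
}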
diff --git a/include/linux/closure.h b/include/linux/closure.h
index 880fe85e..83a0dde3 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -128,14 +128,15 @@ enum closure_state {
* annotate where references are being transferred.
*/
- CLOSURE_BITS_START = (1U << 26),
- CLOSURE_DESTRUCTOR = (1U << 26),
+ CLOSURE_BITS_START = (1U << 24),
+ CLOSURE_DESTRUCTOR = (1U << 24),
+ CLOSURE_SLEEPING = (1U << 26),
CLOSURE_WAITING = (1U << 28),
CLOSURE_RUNNING = (1U << 30),
};
#define CLOSURE_GUARD_MASK \
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
+ (((CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)|(CLOSURE_BITS_START >> 1))
#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
@@ -144,7 +145,7 @@ struct closure {
union {
struct {
struct workqueue_struct *wq;
- struct closure_syncer *s;
+ struct task_struct *sleeper;
struct llist_node list;
closure_fn *fn;
};
@@ -154,7 +155,6 @@ struct closure {
struct closure *parent;
atomic_t remaining;
- bool closure_get_happened;
#ifdef CONFIG_DEBUG_CLOSURES
#define CLOSURE_MAGIC_DEAD 0xc054dead
@@ -169,11 +169,18 @@ struct closure {
};
void closure_sub(struct closure *cl, int v);
-void closure_put(struct closure *cl);
void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
void __closure_sync(struct closure *cl);
+/*
+ * closure_put - decrement a closure's refcount
+ */
+static inline void closure_put(struct closure *cl)
+{
+ closure_sub(cl, 1);
+}
+
static inline unsigned closure_nr_remaining(struct closure *cl)
{
return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
@@ -187,11 +194,7 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
*/
static inline void closure_sync(struct closure *cl)
{
-#ifdef CONFIG_DEBUG_CLOSURES
- BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
-#endif
-
- if (cl->closure_get_happened)
+ if (closure_nr_remaining(cl) > 1)
__closure_sync(cl);
}
@@ -199,10 +202,7 @@ int __closure_sync_timeout(struct closure *cl, unsigned long timeout);
static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout)
{
-#ifdef CONFIG_DEBUG_CLOSURES
- BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
-#endif
- return cl->closure_get_happened
+ return closure_nr_remaining(cl) > 1
? __closure_sync_timeout(cl, timeout)
: 0;
}
@@ -275,8 +275,6 @@ static inline void closure_queue(struct closure *cl)
*/
static inline void closure_get(struct closure *cl)
{
- cl->closure_get_happened = true;
-
#ifdef CONFIG_DEBUG_CLOSURES
BUG_ON((atomic_inc_return(&cl->remaining) &
CLOSURE_REMAINING_MASK) <= 1);
@@ -314,7 +312,6 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
closure_get(parent);
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
- cl->closure_get_happened = false;
closure_debug_create(cl);
closure_set_ip(cl);
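With closure_get_happened gone, closure_sync() and closure_sync_timeout() decide whether to sleep by inspecting the live refcount instead: since CLOSURE_REMAINING_INITIALIZER is (1|CLOSURE_RUNNING), closure_nr_remaining() == 1 means no reference is outstanding and the sync is a no-op. A sketch of the caller-side pattern this optimizes, assuming the usual closure_init_stack() helper from this header:

	struct closure cl;

	closure_init_stack(&cl);

	/* ... submit work that may take a ref with closure_get(&cl) ... */

	closure_sync(&cl);	/* sleeps only if a ref is still outstanding */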
diff --git a/libbcachefs/Makefile b/libbcachefs/Makefile
index 93c8ee54..bb2a80fb 100644
--- a/libbcachefs/Makefile
+++ b/libbcachefs/Makefile
@@ -41,7 +41,6 @@ bcachefs-y := \
extents.o \
extent_update.o \
eytzinger.o \
- fast_list.o \
fs.o \
fs-ioctl.o \
fs-io.o \
@@ -99,6 +98,7 @@ bcachefs-y := \
varint.o \
xattr.o
+bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += fast_list.o
bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 02aef668..ae6d0aa8 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -306,7 +306,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
{
- if (cl->closure_get_happened)
+ if (closure_nr_remaining(cl) > 1)
__bch2_wait_on_allocator(c, cl);
}
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index c662eeba..3193dbcf 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -432,6 +432,10 @@ fsck_err:
/* verify that every backpointer has a corresponding alloc key */
int bch2_check_btree_backpointers(struct bch_fs *c)
{
+ struct progress_indicator_state progress;
+
+ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_backpointers));
+
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
@@ -439,8 +443,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
CLASS(btree_trans, trans)(c);
int ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter);
+ bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed);
+ }));
bch2_bkey_buf_exit(&last_flushed, c);
return ret;
@@ -815,7 +821,9 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct progress_indicator_state progress;
int ret = 0;
- bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+ bch2_progress_init_inner(&progress, trans->c,
+ btree_has_data_ptrs_mask,
+ ~0ULL);
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 08393971..d29bd684 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -706,7 +706,8 @@ struct bch_sb_field_ext {
x(fast_device_removal, BCH_VERSION(1, 27)) \
x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
- x(31bit_dirent_offset, BCH_VERSION(1, 30))
+ x(31bit_dirent_offset, BCH_VERSION(1, 30)) \
+ x(btree_node_accounting, BCH_VERSION(1, 31))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -717,7 +718,7 @@ enum bcachefs_metadata_version {
};
static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_btree_node_accounting;
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -965,7 +966,8 @@ enum bch_sb_feature {
x(alloc_info, 0) \
x(alloc_metadata, 1) \
x(extents_above_btree_updates_done, 2) \
- x(bformat_overflow_done, 3)
+ x(bformat_overflow_done, 3) \
+ x(no_stale_ptrs, 4)
enum bch_sb_compat {
#define x(f, n) BCH_COMPAT_##f,
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 75d73677..da1a1a21 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -344,15 +344,6 @@ void bch2_bkey_swab_val(struct bkey_s k)
ops->swab(k);
}
-bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
- return ops->key_normalize
- ? ops->key_normalize(c, k)
- : false;
-}
-
bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index bf34111c..5adce4e9 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -26,7 +26,6 @@ struct bkey_ops {
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
- bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
@@ -66,8 +65,6 @@ void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
void bch2_bkey_swab_val(struct bkey_s);
-bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-
static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
{
return l->type == r->type &&
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 3b1d694d..59638d09 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -15,7 +15,6 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
-#include <linux/seq_buf.h>
#include <linux/swap.h>
const char * const bch2_btree_node_flags[] = {
@@ -566,19 +565,6 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
return btree_cache_can_free(list);
}
-static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
-{
- struct btree_cache_list *list = shrink->private_data;
- struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
-
- char *cbuf;
- size_t buflen = seq_buf_get_buf(s, &cbuf);
- struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
-
- bch2_btree_cache_to_text(&out, bc);
- seq_buf_commit(s, out.pos);
-}
-
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
@@ -673,7 +659,6 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bc->live[0].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
- shrink->to_text = bch2_btree_cache_shrinker_to_text;
shrink->seeks = 2;
shrink->private_data = &bc->live[0];
shrinker_register(shrink);
@@ -684,7 +669,6 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
- shrink->to_text = bch2_btree_cache_shrinker_to_text;
shrink->seeks = 8;
shrink->private_data = &bc->live[1];
shrinker_register(shrink);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 2338feb8..63dc0836 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -780,7 +780,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
int ret = 0;
struct progress_indicator_state progress;
- bch2_progress_init(&progress, c, ~0ULL);
+ bch2_progress_init_inner(&progress, c, ~0ULL, ~0ULL);
enum btree_id ids[BTREE_ID_NR];
for (unsigned i = 0; i < BTREE_ID_NR; i++)
@@ -1140,43 +1140,11 @@ static int gc_btree_gens_key(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
return -EROFS;
- bool too_stale = false;
- scoped_guard(rcu) {
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- too_stale |= dev_ptr_stale(ca, ptr) > 16;
- }
-
- if (!too_stale)
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(*gen, ptr->gen))
- *gen = ptr->gen;
- }
- }
-
- if (too_stale) {
- struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
- int ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- bch2_extent_normalize(c, bkey_i_to_s(u));
- }
-
- return 0;
+ return bch2_bkey_drop_stale_ptrs(trans, iter, k);
}
static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
@@ -1281,6 +1249,12 @@ int bch2_gc_gens(struct bch_fs *c)
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_and_count(c, gc_gens_end, c);
+
+ if (!(c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs))) {
+ guard(mutex)(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
+ bch2_write_super(c);
+ }
err:
for_each_member_device(c, ca) {
kvfree(ca->oldest_gen);
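Setting BCH_COMPAT_no_stale_ptrs here follows the same superblock pattern as the recovery.c hunk later in this patch: test the cached bit locklessly, then take sb_lock, set the on-disk bit, and write the superblock. A hypothetical helper (not in this patch) capturing the idiom, built only from calls the patch itself uses:

static void bch2_set_compat_bit(struct bch_fs *c, unsigned bit)
{
	if (c->sb.compat & BIT_ULL(bit))
		return;			/* already set, skip the lock */

	guard(mutex)(&c->sb_lock);
	c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(bit));
	bch2_write_super(c);
}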
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index e3336ab2..4890cbc8 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -13,7 +13,6 @@
#include "trace.h"
#include <linux/sched/mm.h>
-#include <linux/seq_buf.h>
static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
@@ -809,18 +808,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
}
-static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
-{
- struct bch_fs *c = shrink->private_data;
- struct btree_key_cache *bc = &c->btree_key_cache;
- char *cbuf;
- size_t buflen = seq_buf_get_buf(s, &cbuf);
- struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
-
- bch2_btree_key_cache_to_text(&out, bc);
- seq_buf_commit(s, out.pos);
-}
-
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
@@ -845,7 +832,6 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->shrink = shrink;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
- shrink->to_text = bch2_btree_key_cache_shrinker_to_text;
shrink->batch = 1 << 14;
shrink->seeks = 0;
shrink->private_data = c;
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index a4f8aac4..00477464 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -69,6 +69,7 @@ struct trans_waiting_for_lock {
struct lock_graph {
struct trans_waiting_for_lock g[8];
unsigned nr;
+ bool printed_chain;
};
static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
@@ -89,6 +90,10 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
{
+ if (g->printed_chain || g->nr <= 1)
+ return;
+ g->printed_chain = true;
+
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
@@ -124,6 +129,7 @@ static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
.node_want = trans->locking,
.lock_want = trans->locking_wait.lock_want,
};
+ g->printed_chain = false;
}
static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
@@ -265,8 +271,12 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
closure_put(&trans->ref);
- if (orig_trans->lock_may_not_fail)
+ if (orig_trans->lock_may_not_fail) {
+ /* Other threads will have to rerun the cycle detector: */
+ for (struct trans_waiting_for_lock *i = g->g + 1; i < g->g + g->nr; i++)
+ wake_up_process(i->trans->locking_wait.task);
return 0;
+ }
lock_graph_pop_all(g);
@@ -398,7 +408,7 @@ next:
}
}
up:
- if (g.nr > 1 && cycle)
+ if (cycle)
print_chain(cycle, &g);
lock_graph_up(&g);
goto next;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 021f5cb7..7f08863f 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -462,6 +462,7 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
CLASS(printbuf, buf)();
bool inserting = sectors > 0;
+ int ret = 0;
BUG_ON(!sectors);
@@ -489,8 +490,17 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
BCH_FSCK_ERR_ptr_too_stale);
}
- if (b_gen != ptr->gen && ptr->cached)
+ if (b_gen != ptr->gen && ptr->cached) {
+ if (fsck_err_on(c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs),
+ trans, stale_ptr_with_no_stale_ptrs_feature,
+ "stale cached ptr, but have no_stale_ptrs feature\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ guard(mutex)(&c->sb_lock);
+ c->disk_sb.sb->compat[0] &= ~cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
+ bch2_write_super(c);
+ }
return 1;
+ }
if (unlikely(b_gen != ptr->gen)) {
bch2_log_msg_start(c, &buf);
@@ -530,7 +540,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
*bucket_sectors += sectors;
- return 0;
+fsck_err:
+ return ret;
}
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
@@ -749,6 +760,7 @@ static int __trigger_extent(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
bool gc = flags & BTREE_TRIGGER_gc;
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -802,7 +814,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (cur_compression_type &&
cur_compression_type != p.crc.compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
+ if (!insert)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
@@ -835,7 +847,7 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (cur_compression_type) {
- if (flags & BTREE_TRIGGER_overwrite)
+ if (!insert)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
@@ -845,12 +857,17 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (level) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
+ const bool leaf_node = level == 1;
+ s64 v[3] = {
+ replicas_sectors,
+ insert ? 1 : -1,
+ !leaf_node ? (insert ? 1 : -1) : 0,
+ };
+
+ ret = bch2_disk_accounting_mod2(trans, gc, v, btree, btree_id);
if (ret)
return ret;
} else {
- bool insert = !(flags & BTREE_TRIGGER_overwrite);
-
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
@@ -869,7 +886,6 @@ int bch2_trigger_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
- struct bch_fs *c = trans->c;
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
@@ -900,30 +916,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
return ret;
}
- int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta[1] = { 0 };
-
- s64 s = bch2_bkey_sectors_need_rebalance(c, old);
- need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta[0] -= s;
-
- s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
- need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta[0] += s;
-
- if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, need_rebalance_delta > 0);
- if (ret)
- return ret;
- }
-
- if (need_rebalance_sectors_delta[0]) {
- int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
- need_rebalance_sectors_delta, rebalance_work);
- if (ret)
- return ret;
- }
+ int ret = bch2_trigger_extent_rebalance(trans, old, new.s_c, flags);
+ if (ret)
+ return ret;
}
return 0;
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 7a0da6cd..ca925c5d 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -393,7 +393,7 @@ restart_drop_extra_replicas:
bch2_extent_ptr_decoded_append(insert, &p);
bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
+ bch2_bkey_drop_extra_cached_ptrs(c, &m->op.opts, bkey_i_to_s(insert));
ret = bch2_sum_sector_overwrites(trans, &iter, insert,
&should_check_enospc,
@@ -721,7 +721,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
- bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
+ bch2_bkey_drop_extra_cached_ptrs(c, io_opts, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
diff --git a/libbcachefs/disk_accounting_format.h b/libbcachefs/disk_accounting_format.h
index 8269af1d..730a17ea 100644
--- a/libbcachefs/disk_accounting_format.h
+++ b/libbcachefs/disk_accounting_format.h
@@ -108,7 +108,7 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
x(dev_data_type, 3, 3) \
x(compression, 4, 3) \
x(snapshot, 5, 1) \
- x(btree, 6, 1) \
+ x(btree, 6, 3) \
x(rebalance_work, 7, 1) \
x(inum, 8, 3)
@@ -174,6 +174,14 @@ struct bch_acct_snapshot {
__u32 id;
} __packed;
+/*
+ * Metadata accounting per btree id:
+ * [
+ * total btree disk usage in sectors
+ * total number of btree nodes
+ * number of non-leaf btree nodes
+ * ]
+ */
struct bch_acct_btree {
__u32 id;
} __packed;
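The btree accounting key grows from one counter to three, consumed positionally in the order documented above. A sketch of a reader, mirroring the struct the progress.c hunk below uses (the field names are descriptive, not mandated by the format):

	struct {
		u64 disk_sectors;	/* total btree disk usage, in sectors */
		u64 total_nodes;	/* all btree nodes */
		u64 inner_nodes;	/* non-leaf nodes only */
	} v = {0};

	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc),
				 (u64 *) &v, sizeof(v) / sizeof(u64));

	/* the leaf count falls out by subtraction: */
	u64 leaf_nodes = v.total_nodes - v.inner_nodes;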
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 271e2521..89a95b6c 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -22,6 +22,7 @@
#include "io_write.h"
#include "keylist.h"
#include "lru.h"
+#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
@@ -1129,7 +1130,13 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
(union bch_extent_entry *) ec_ptr,
(union bch_extent_entry *) &stripe_ptr);
- ret = bch2_trans_update(trans, &iter, n, 0);
+ struct bch_inode_opts opts;
+
+ ret = bch2_extent_get_io_opts_one(trans, &opts, &iter, bkey_i_to_s_c(n),
+ SET_NEEDS_REBALANCE_other) ?:
+ bch2_bkey_set_needs_rebalance(trans->c, &opts, n,
+ SET_NEEDS_REBALANCE_other, 0) ?:
+ bch2_trans_update(trans, &iter, n, 0);
out:
bch2_trans_iter_exit(&iter);
return ret;
@@ -1144,8 +1151,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
int ret = 0;
CLASS(bch2_dev_tryget, ca)(c, ptr.dev);
- if (!ca)
- return bch_err_throw(c, ENOENT_dev_not_found);
+ if (!ca) /* BCH_SB_MEMBER_INVALID */
+ return 0;
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 86aa93ea..3274ba42 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -12,6 +12,7 @@
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
+#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "compress.h"
@@ -1213,6 +1214,21 @@ drop:
bch2_bkey_drop_ptr_noerror(k, ptr);
}
+static bool bch2_bkey_has_stale_ptrs(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_dev *ca;
+
+ guard(rcu)();
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->cached &&
+ (ca = bch2_dev_rcu_noerror(c, ptr->dev)) &&
+ dev_ptr_stale_rcu(ca, ptr) > 0)
+ return true;
+
+ return false;
+}
+
/*
* bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
*
@@ -1221,7 +1237,7 @@ drop:
* For existing keys, only called when btree nodes are being rewritten, not when
* they're merely being compacted/resorted in memory.
*/
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+static void __bch2_bkey_drop_stale_ptrs(struct bch_fs *c, struct bkey_s k)
{
struct bch_dev *ca;
@@ -1230,19 +1246,26 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
ptr->cached &&
(!(ca = bch2_dev_rcu_noerror(c, ptr->dev)) ||
dev_ptr_stale_rcu(ca, ptr) > 0));
+}
+
+int bch2_bkey_drop_stale_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+ if (bch2_bkey_has_stale_ptrs(trans->c, k)) {
+ struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k,
+ BTREE_UPDATE_internal_snapshot_node);
+ int ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ __bch2_bkey_drop_stale_ptrs(trans->c, bkey_i_to_s(u));
+ }
- return bkey_deleted(k.k);
+ return 0;
}
-/*
- * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
- *
- * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
- * the promote target.
- */
-bool bch2_extent_normalize_by_opts(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_s k)
+void bch2_bkey_drop_extra_cached_ptrs(struct bch_fs *c,
+ struct bch_inode_opts *opts,
+ struct bkey_s k)
{
struct bkey_ptrs ptrs;
bool have_cached_ptr;
@@ -1260,8 +1283,6 @@ restart_drop_ptrs:
}
have_cached_ptr = true;
}
-
- return bkey_deleted(k.k);
}
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 03ea7c68..1ea9752b 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -440,7 +440,6 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.key_validate = bch2_bkey_ptrs_validate, \
.val_to_text = bch2_bkey_ptrs_to_text, \
.swab = bch2_ptr_swab, \
- .key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
.trigger = bch2_trigger_extent, \
})
@@ -689,8 +688,8 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_inode_opts *,
struct bkey_s, struct bch_extent_ptr *);
-bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_inode_opts *, struct bkey_s);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+int bch2_bkey_drop_stale_ptrs(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
+void bch2_bkey_drop_extra_cached_ptrs(struct bch_fs *, struct bch_inode_opts *, struct bkey_s);
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index a104b9d7..d5340973 100644
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -117,7 +117,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
} else {
atomic_set(&dio->cl.remaining,
CLOSURE_REMAINING_INITIALIZER + 1);
- dio->cl.closure_get_happened = true;
}
dio->req = req;
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index ccc44b1f..3bde5c07 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1963,7 +1963,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
}
}
- ret = check_extent_overbig(trans, iter, k);
+ ret = check_extent_overbig(trans, iter, k) ?:
+ bch2_bkey_drop_stale_ptrs(trans, iter, k);
if (ret)
goto err;
@@ -2040,7 +2041,8 @@ int bch2_check_indirect_extents(struct bch_fs *c)
BCH_TRANS_COMMIT_no_enospc, ({
progress_update_iter(trans, &progress, &iter);
bch2_disk_reservation_put(c, &res);
- check_extent_overbig(trans, &iter, k);
+ check_extent_overbig(trans, &iter, k) ?:
+ bch2_bkey_drop_stale_ptrs(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
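Both fsck hunks chain the checks with the GNU `?:` extension: `a ?: b` evaluates to a when a is nonzero, so the first step to return an error short-circuits the rest. Desugared, the chained form above is equivalent to:

	int ret = check_extent_overbig(trans, iter, k);
	if (!ret)	/* second step runs only if the first returned 0 */
		ret = bch2_bkey_drop_stale_ptrs(trans, iter, k);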
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index e7ba0d0b..3765aa52 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -37,6 +37,12 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif
+static bool bch2_poison_extents_on_checksum_error;
+module_param_named(poison_extents_on_checksum_error,
+ bch2_poison_extents_on_checksum_error, bool, 0644);
+MODULE_PARM_DESC(poison_extents_on_checksum_error,
+ "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now)
@@ -539,6 +545,9 @@ static void get_rbio_extent(struct btree_trans *trans,
static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
enum btree_id btree, struct bkey_s_c read_k)
{
+ if (!bch2_poison_extents_on_checksum_error)
+ return 0;
+
struct bch_fs *c = trans->c;
struct data_update *u = rbio_data_update(rbio);
@@ -1274,10 +1283,6 @@ retry_pick:
async_object_list_add(c, rbio, rbio, &rbio->list_idx);
- /* XXX: also nvme read recovery level */
- if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
- rbio->bio.bi_opf |= REQ_FUA;
-
if (rbio->bounce)
trace_and_count(c, io_read_bounce, &rbio->bio);
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 8a3981e1..92edff50 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -84,13 +84,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return ret;
/*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_error key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, bkey_i_to_s(n));
-
- /*
* Since we're not inserting through an extent iterator
* (BTREE_ITER_all_snapshots iterators aren't extent iterators),
* we aren't using the extent overwrite path to delete, we're
@@ -273,10 +266,15 @@ int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx,
unsigned flags, struct printbuf *err)
{
struct progress_indicator_state progress;
+ int ret;
+
bch2_progress_init(&progress, c,
- BIT_ULL(BTREE_ID_extents)|
- BIT_ULL(BTREE_ID_reflink));
+ btree_has_data_ptrs_mask & ~BIT_ULL(BTREE_ID_stripes));
+
+ if ((ret = bch2_dev_usrdata_drop(c, &progress, dev_idx, flags, err)))
+ return ret;
+
+ bch2_progress_init_inner(&progress, c, 0, ~0ULL);
- return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags, err) ?:
- bch2_dev_metadata_drop(c, &progress, dev_idx, flags, err);
+ return bch2_dev_metadata_drop(c, &progress, dev_idx, flags, err);
}
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 122bc98e..bd5faafc 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -518,7 +518,8 @@ void bch2_opts_to_text(struct printbuf *out,
}
}
-int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v)
+int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v,
+ bool change)
{
int ret = 0;
@@ -542,13 +543,26 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum b
break;
}
+ if (change &&
+ (id == Opt_foreground_target ||
+ id == Opt_background_target ||
+ id == Opt_promote_target ||
+ id == Opt_compression ||
+ id == Opt_background_compression ||
+ id == Opt_data_checksum ||
+ id == Opt_data_replicas)) {
+ ret = bch2_set_rebalance_needs_scan(c, inum);
+ if (ret)
+ return ret;
+ }
+
return ret;
}
int bch2_opts_hooks_pre_set(struct bch_fs *c)
{
for (unsigned i = 0; i < bch2_opts_nr; i++) {
- int ret = bch2_opt_hook_pre_set(c, NULL, 0, i, bch2_opt_get_by_id(&c->opts, i));
+ int ret = bch2_opt_hook_pre_set(c, NULL, 0, i, bch2_opt_get_by_id(&c->opts, i), false);
if (ret)
return ret;
}
@@ -559,14 +573,18 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c)
void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
enum bch_opt_id id, u64 v)
{
- switch (id) {
- case Opt_foreground_target:
- case Opt_compression:
- case Opt_background_target:
- case Opt_background_compression:
+ if (id == Opt_foreground_target ||
+ id == Opt_background_target ||
+ id == Opt_promote_target ||
+ id == Opt_compression ||
+ id == Opt_background_compression ||
+ id == Opt_data_checksum ||
+ id == Opt_data_replicas) {
bch2_set_rebalance_needs_scan(c, inum);
bch2_rebalance_wakeup(c);
- break;
+ }
+
+ switch (id) {
case Opt_rebalance_enabled:
bch2_rebalance_wakeup(c);
break;
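The same seven option IDs now appear in both bch2_opt_hook_pre_set() and bch2_opt_hook_post_set(). A hypothetical shared predicate (not in this patch) would keep the two lists from drifting apart:

static bool opt_triggers_rebalance_scan(enum bch_opt_id id)
{
	switch (id) {
	case Opt_foreground_target:
	case Opt_background_target:
	case Opt_promote_target:
	case Opt_compression:
	case Opt_background_compression:
	case Opt_data_checksum:
	case Opt_data_replicas:
		return true;
	default:
		return false;
	}
}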
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 22cf109f..6b9f1883 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -149,12 +149,12 @@ enum fsck_err_opts {
BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX + 1), \
BCH_SB_META_REPLICAS_WANT, 1, \
"#", "Number of metadata replicas") \
x(data_replicas, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX + 1), \
BCH_SB_DATA_REPLICAS_WANT, 1, \
"#", "Number of data replicas") \
@@ -175,12 +175,12 @@ enum fsck_err_opts {
BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
"size", "Maximum size of checksummed/compressed extents")\
x(metadata_checksum, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
OPT_STR(__bch2_csum_opts), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(data_checksum, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
@@ -190,12 +190,12 @@ enum fsck_err_opts {
BCH_SB_CSUM_ERR_RETRY_NR, 3, \
NULL, NULL) \
x(compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
x(background_compression, u8, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
@@ -205,27 +205,27 @@ enum fsck_err_opts {
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or label for metadata writes") \
x(foreground_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_FOREGROUND_TARGET, 0, \
"(target)", "Device or label for foreground writes") \
x(background_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_BACKGROUND_TARGET, 0, \
"(target)", "Device or label to move data to in the background")\
x(promote_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0, \
"(target)", "Device or label to promote data to on read") \
x(erasure_code, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
@@ -658,7 +658,7 @@ void bch2_opts_to_text(struct printbuf *,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);
-int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64);
+int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64, bool);
int bch2_opts_hooks_pre_set(struct bch_fs *);
void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64);
diff --git a/libbcachefs/progress.c b/libbcachefs/progress.c
index 541ee951..7cc16490 100644
--- a/libbcachefs/progress.c
+++ b/libbcachefs/progress.c
@@ -4,14 +4,21 @@
#include "disk_accounting.h"
#include "progress.h"
-void bch2_progress_init(struct progress_indicator_state *s,
- struct bch_fs *c,
- u64 btree_id_mask)
+void bch2_progress_init_inner(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 leaf_btree_id_mask,
+ u64 inner_btree_id_mask)
{
memset(s, 0, sizeof(*s));
s->next_print = jiffies + HZ * 10;
+ /* This is only an estimation: nodes can have different replica counts */
+ const u32 expected_node_disk_sectors =
+ READ_ONCE(c->opts.metadata_replicas) * btree_sectors(c);
+
+ const u64 btree_id_mask = leaf_btree_id_mask | inner_btree_id_mask;
+
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
if (!(btree_id_mask & BIT_ULL(i)))
continue;
@@ -19,9 +26,29 @@ void bch2_progress_init(struct progress_indicator_state *s,
struct disk_accounting_pos acc;
disk_accounting_key_init(acc, btree, .id = i);
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
- s->nodes_total += div64_ul(v, btree_sectors(c));
+ struct {
+ u64 disk_sectors;
+ u64 total_nodes;
+ u64 inner_nodes;
+ } v = {0};
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc),
+ (u64 *)&v, sizeof(v) / sizeof(u64));
+
+ /* Better to estimate as 0 than the total node count */
+ if (inner_btree_id_mask & BIT_ULL(i))
+ s->nodes_total += v.inner_nodes;
+
+ if (!(leaf_btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ /*
+ * We check for zeros to degrade gracefully when run
+ * with un-upgraded accounting info (missing some counters).
+ */
+ if (v.total_nodes != 0)
+ s->nodes_total += v.total_nodes - v.inner_nodes;
+ else
+ s->nodes_total += div_u64(v.disk_sectors, expected_node_disk_sectors);
}
}
diff --git a/libbcachefs/progress.h b/libbcachefs/progress.h
index 972a7308..91f34533 100644
--- a/libbcachefs/progress.h
+++ b/libbcachefs/progress.h
@@ -20,7 +20,17 @@ struct progress_indicator_state {
struct btree *last_node;
};
-void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
+void bch2_progress_init_inner(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 leaf_btree_id_mask,
+ u64 inner_btree_id_mask);
+
+static inline void bch2_progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c, u64 btree_id_mask)
+{
+ bch2_progress_init_inner(s, c, btree_id_mask, 0);
+}
+
void bch2_progress_update_iter(struct btree_trans *,
struct progress_indicator_state *,
struct btree_iter *,
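The split keeps existing callers unchanged while letting new ones count interior nodes too. Usage, as seen elsewhere in this patch:

	/* leaf nodes of one btree (old behaviour, via the wrapper): */
	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_backpointers));

	/* leaves of every btree that can hold data pointers, plus all
	 * interior nodes (check_extents_to_backpointers): */
	bch2_progress_init_inner(&progress, c, btree_has_data_ptrs_mask, ~0ULL);

	/* interior nodes only (metadata drop in migrate.c): */
	bch2_progress_init_inner(&progress, c, 0, ~0ULL);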
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index fa73de78..67d6a90e 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -92,122 +92,140 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_s_c k,
- struct bkey_ptrs_c ptrs)
+int bch2_trigger_extent_rebalance(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ enum btree_iter_update_trigger_flags flags)
{
- if (!opts->background_compression)
- return 0;
+ struct bch_fs *c = trans->c;
+ int need_rebalance_delta = 0;
+ s64 need_rebalance_sectors_delta[1] = { 0 };
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
+ s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+ need_rebalance_delta -= s != 0;
+ need_rebalance_sectors_delta[0] -= s;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten)
- return 0;
+ s = bch2_bkey_sectors_need_rebalance(c, new);
+ need_rebalance_delta += s != 0;
+ need_rebalance_sectors_delta[0] += s;
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
+ if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ new.k->p, need_rebalance_delta > 0);
+ if (ret)
+ return ret;
}
- return rewrite_ptrs;
-}
-
-static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_ptrs_c ptrs)
-{
- if (!opts->background_target ||
- !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
- return 0;
-
- unsigned ptr_bit = 1;
- unsigned rewrite_ptrs = 0;
-
- guard(rcu)();
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
- rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
+ if (need_rebalance_sectors_delta[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ need_rebalance_sectors_delta, rebalance_work);
+ if (ret)
+ return ret;
}
- return rewrite_ptrs;
+ return 0;
}
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
- struct bch_inode_opts *opts,
- struct bkey_s_c k)
+static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_inode_opts *io_opts,
+ unsigned *move_ptrs,
+ unsigned *compress_ptrs,
+ u64 *sectors)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ *move_ptrs = 0;
+ *compress_ptrs = 0;
+ *sectors = 0;
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
-
- return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
- bch2_bkey_ptrs_need_move(c, opts, ptrs);
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
- if (!opts)
- return 0;
+ const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
+ if (!io_opts && !rb_opts)
+ return;
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return 0;
+ return;
+
+ unsigned compression_type =
+ bch2_compression_opt_to_type(io_opts
+ ? io_opts->background_compression
+ : rb_opts->background_compression);
+ unsigned target = io_opts
+ ? io_opts->background_target
+ : rb_opts->background_target;
+ if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target))
+ target = 0;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- u64 sectors = 0;
+ bool incompressible = false, unwritten = false;
+
+ unsigned ptr_idx = 1;
- if (opts->background_compression) {
- unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+ guard(rcu)();
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ incompressible |= p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
+ unwritten |= p.ptr.unwritten;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
- p.ptr.unwritten) {
- sectors = 0;
- goto incompressible;
- }
+ if (!p.ptr.cached) {
+ if (p.crc.compression_type != compression_type)
+ *compress_ptrs |= ptr_idx;
- if (!p.ptr.cached && p.crc.compression_type != compression_type)
- sectors += p.crc.compressed_size;
+ if (target && !bch2_dev_in_target(c, p.ptr.dev, target))
+ *move_ptrs |= ptr_idx;
}
+
+ ptr_idx <<= 1;
}
-incompressible:
- if (opts->background_target) {
- guard(rcu)();
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
- sectors += p.crc.compressed_size;
+
+ if (unwritten)
+ *compress_ptrs = 0;
+ if (incompressible)
+ *compress_ptrs = 0;
+
+ unsigned rb_ptrs = *move_ptrs | *compress_ptrs;
+
+ if (!rb_ptrs)
+ return;
+
+ ptr_idx = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (rb_ptrs & ptr_idx)
+ *sectors += p.crc.compressed_size;
+ ptr_idx <<= 1;
}
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ u64 sectors = 0;
+ bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, &sectors);
return sectors;
}
-static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_inode_opts *opts,
- struct bkey_s_c k)
+static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
+ struct bch_inode_opts *opts,
+ struct bkey_s_c k)
{
- if (!bkey_extent_is_direct_data(k.k))
- return 0;
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ u64 sectors = 0;
- const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+ bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
+ return move_ptrs|compress_ptrs;
+}
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
- return old == NULL || memcmp(old, &new, sizeof(new));
- } else {
- return old != NULL;
+static inline bool bkey_should_have_rb_opts(struct bch_fs *c,
+ struct bch_inode_opts *opts,
+ struct bkey_s_c k)
+{
+ if (k.k->type == KEY_TYPE_reflink_v) {
+#define x(n) if (opts->n##_from_inode) return true;
+ BCH_REBALANCE_OPTS()
+#undef x
}
+ return bch2_bkey_ptrs_need_rebalance(c, opts, k);
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
@@ -222,7 +240,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
struct bch_extent_rebalance *old =
(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
- if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
+ if (bkey_should_have_rb_opts(c, opts, k.s_c)) {
if (!old) {
old = bkey_val_end(k);
k.k->u64s += sizeof(*old) / sizeof(u64);
@@ -243,22 +261,40 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
struct bkey_s_c k,
enum set_needs_rebalance_ctx ctx)
{
+ struct bch_fs *c = trans->c;
+
BUG_ON(iter->flags & BTREE_ITER_is_extents);
BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
- const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
- ? bch2_bkey_rebalance_opts(k) : NULL;
- if (r) {
-#define x(_name) \
- if (r->_name##_from_inode) { \
- io_opts->_name = r->_name; \
- io_opts->_name##_from_inode = true; \
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect;
+
+ /*
+ * If it's an indirect extent, and we walked to it directly, we won't
+ * have the options from the inode that were directly applied: options
+ * from the extent take precedence - unless the io_opts option came from
+ * the inode and may_update_indirect is true (walked from a
+ * REFLINK_P_MAY_UPDATE_OPTIONS pointer).
+ */
+ const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+ if (old && k.k->type == KEY_TYPE_reflink_v) {
+#define x(_name) \
+ if (old->_name##_from_inode && \
+ !(may_update_indirect && io_opts->_name##_from_inode)) { \
+ io_opts->_name = old->_name; \
+ io_opts->_name##_from_inode = true; \
}
BCH_REBALANCE_OPTS()
#undef x
}
- if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts);
+
+ if (bkey_should_have_rb_opts(c, io_opts, k)
+ ? old && !memcmp(old, &new, sizeof(new))
+ : !old)
return 0;
struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
@@ -270,10 +306,10 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
/* On successfull transaction commit, @k was invalidated: */
- return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n, ctx, 0) ?:
+ return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
- bch_err_throw(trans->c, transaction_restart_commit);
+ bch_err_throw(c, transaction_restart_commit);
}
static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans,
@@ -569,23 +605,25 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned move_ptrs = 0;
+ unsigned compress_ptrs = 0;
+ u64 sectors = 0;
- unsigned p = bch2_bkey_ptrs_need_compress(c, opts, k, ptrs);
- if (p) {
- prt_str(&buf, "compression=");
- bch2_compression_opt_to_text(&buf, opts->background_compression);
+ bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
+
+ if (move_ptrs) {
+ prt_str(&buf, "move=");
+ bch2_target_to_text(&buf, c, opts->background_target);
prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
+ bch2_prt_u64_base2(&buf, move_ptrs);
prt_newline(&buf);
}
- p = bch2_bkey_ptrs_need_move(c, opts, ptrs);
- if (p) {
- prt_str(&buf, "move=");
- bch2_target_to_text(&buf, c, opts->background_target);
+ if (compress_ptrs) {
+ prt_str(&buf, "compression=");
+ bch2_compression_opt_to_text(&buf, opts->background_compression);
prt_str(&buf, " ");
- bch2_prt_u64_base2(&buf, p);
+ bch2_prt_u64_base2(&buf, compress_ptrs);
prt_newline(&buf);
}
@@ -659,7 +697,9 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans,
u32 restart_count = trans->restart_count;
int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink,
- POS(0, idx), BTREE_ITER_not_extents, k, ({
+ POS(0, idx),
+ BTREE_ITER_intent|
+ BTREE_ITER_not_extents, k, ({
if (bpos_ge(bkey_start_pos(k.k), POS(0, end)))
break;
bch2_get_update_rebalance_opts(trans, opts, &iter, k,
@@ -696,10 +736,13 @@ static int do_rebalance_scan(struct moving_context *ctxt,
int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
r->scan_start.pos, r->scan_end.pos,
+ BTREE_ITER_intent|
BTREE_ITER_all_snapshots|
BTREE_ITER_prefetch, k, ({
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+ atomic64_add(k.k->size, &r->scan_stats.sectors_seen);
+
struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
snapshot_io_opts, iter.pos, &iter, k,
SET_NEEDS_REBALANCE_opt_change);
@@ -709,10 +752,31 @@ static int do_rebalance_scan(struct moving_context *ctxt,
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)
? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts)
: 0);
- })) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+ }));
+ if (ret)
+ goto out;
+
+ if (!inum) {
+ ret = for_each_btree_key_max(trans, iter, BTREE_ID_reflink,
+ POS_MIN, POS_MAX,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_prefetch, k, ({
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ atomic64_add(k.k->size, &r->scan_stats.sectors_seen);
+ struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
+ snapshot_io_opts, iter.pos, &iter, k,
+ SET_NEEDS_REBALANCE_opt_change);
+ PTR_ERR_OR_ZERO(opts);
+ }));
+ if (ret)
+ goto out;
+ }
+
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+out:
*sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
/*
* Ensure that the rebalance_work entries we created are seen by the
diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h
index bff91aa0..24bafa42 100644
--- a/libbcachefs/rebalance.h
+++ b/libbcachefs/rebalance.h
@@ -29,6 +29,10 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f
void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance *);
+int bch2_trigger_extent_rebalance(struct btree_trans *,
+ struct bkey_s_c, struct bkey_s_c,
+ enum btree_iter_update_trigger_flags);
+
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
enum set_needs_rebalance_ctx {
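Internally, rebalance.c now computes both rewrite reasons in a single pass: bch2_bkey_needs_rebalance() fills one bitmask per reason, where bit n corresponds to the nth pointer in the key (matching its ptr_idx <<= 1 walk), plus the total sectors awaiting rebalance. A sketch of a consumer inside rebalance.c (the helper is static there):

	unsigned move_ptrs = 0, compress_ptrs = 0;
	u64 sectors = 0;

	bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);

	if (move_ptrs | compress_ptrs)
		pr_info("%llu sectors pending rebalance (move %x, compress %x)\n",
			sectors, move_ptrs, compress_ptrs);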
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 531c2ef1..6942d3cf 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -920,6 +920,13 @@ use_clean:
if (bch2_blacklist_entries_gc(c))
write_sb = true;
+ if (!(c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs)) &&
+ (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_extents)) &&
+ (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_indirect_extents))) {
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
+ write_sb = true;
+ }
+
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -982,8 +989,9 @@ int bch2_fs_initialize(struct bch_fs *c)
set_bit(BCH_FS_new_fs, &c->flags);
scoped_guard(mutex, &c->sb_lock) {
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_extents_above_btree_updates_done));
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_bformat_overflow_done));
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
bch2_check_version_downgrade(c);
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index de56a1ee..bfd06fd5 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -104,7 +104,10 @@
x(inode_has_case_insensitive, \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \
- BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
+ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\
+ x(btree_node_accounting, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
@@ -152,7 +155,11 @@
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
- BCH_FSCK_ERR_accounting_key_junk_at_end)
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(btree_node_accounting, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_nr_counters_wrong)
struct upgrade_downgrade_entry {
u64 recovery_passes;
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index 728d8780..77e3fc92 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -170,9 +170,10 @@ enum bch_fsck_flags {
x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
x(ptr_to_missing_stripe, 150, 0) \
x(ptr_to_incorrect_stripe, 151, 0) \
- x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \
+ x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \
x(ptr_too_stale, 153, 0) \
x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
+ x(stale_ptr_with_no_stale_ptrs_feature, 327, FSCK_AUTOFIX) \
x(ptr_bucket_data_type_mismatch, 155, 0) \
x(ptr_cached_and_erasure_coded, 156, 0) \
x(ptr_crc_uncompressed_size_too_small, 157, 0) \
@@ -338,7 +339,7 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 327, 0)
+ x(MAX, 328, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 473ad4b5..03b12c2d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -238,6 +238,7 @@ static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
static int bch2_dev_attach_bdev(struct bch_fs *, struct bch_sb_handle *, struct printbuf *);
+static bool bch2_fs_will_resize_on_mount(struct bch_fs *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
@@ -964,6 +965,9 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
if (c->opts.journal_rewind)
c->opts.fsck = true;
+ bool may_upgrade_downgrade = !(c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) ||
+ bch2_fs_will_resize_on_mount(c);
+
CLASS(printbuf, p)();
bch2_log_msg_start(c, &p);
@@ -1040,22 +1044,24 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
prt_bitflags(&p, __bch2_btree_ids, btrees_lost_data);
}
- if (bch2_check_version_downgrade(c)) {
- prt_str(&p, "\nVersion downgrade required:");
+ if (may_upgrade_downgrade) {
+ if (bch2_check_version_downgrade(c)) {
+ prt_str(&p, "\nVersion downgrade required:");
- __le64 passes = ext->recovery_passes_required[0];
- bch2_sb_set_downgrade(c,
- BCH_VERSION_MINOR(bcachefs_metadata_version_current),
- BCH_VERSION_MINOR(c->sb.version));
- passes = ext->recovery_passes_required[0] & ~passes;
- if (passes) {
- prt_str(&p, "\nrunning recovery passes: ");
- prt_bitflags(&p, bch2_recovery_passes,
- bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&p, "\nrunning recovery passes: ");
+ prt_bitflags(&p, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ }
}
- }
- check_version_upgrade(c);
+ check_version_upgrade(c);
+ }
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
@@ -1993,7 +1999,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags,
struct printbuf *err)
{
unsigned dev_idx = ca->dev_idx, data;
- bool fast_device_removal = !bch2_request_incompat_feature(c,
+ bool fast_device_removal = (c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs)) &&
+ !bch2_request_incompat_feature(c,
bcachefs_metadata_version_fast_device_removal);
int ret;
@@ -2421,15 +2428,29 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct p
return 0;
}
+static bool bch2_dev_will_resize_on_mount(struct bch_dev *ca)
+{
+ return ca->mi.resize_on_mount &&
+ ca->mi.nbuckets < div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
+ ca->mi.bucket_size);
+}
+
+static bool bch2_fs_will_resize_on_mount(struct bch_fs *c)
+{
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount)
+ if (bch2_dev_will_resize_on_mount(ca))
+ return true;
+ return false;
+}
+
int bch2_fs_resize_on_mount(struct bch_fs *c)
{
for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
- u64 old_nbuckets = ca->mi.nbuckets;
- u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
- ca->mi.bucket_size);
+ if (bch2_dev_will_resize_on_mount(ca)) {
+ u64 old_nbuckets = ca->mi.nbuckets;
+ u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
+ ca->mi.bucket_size);
- if (ca->mi.resize_on_mount &&
- new_nbuckets > ca->mi.nbuckets) {
bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
bch_err_fn(ca, ret);
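
The new bch2_dev_will_resize_on_mount() predicate factors out the comparison that bch2_fs_resize_on_mount() previously did inline: a device wants resizing when its recorded bucket count is smaller than its capacity divided by the bucket size. A standalone sketch of that arithmetic with hypothetical numbers (div64_u64() in the kernel becomes plain division here):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static bool will_resize(uint64_t capacity_bytes, uint64_t bucket_size,
				uint64_t cur_nbuckets)
	{
		return cur_nbuckets < capacity_bytes / bucket_size;
	}

	int main(void)
	{
		/* 1 TiB device, 512 KiB buckets, currently sized for 2^20 buckets: */
		uint64_t cap = 1ULL << 40, bsz = 512 << 10, nbuckets = 1 << 20;

		printf("will resize: %d (max %llu buckets)\n",
		       will_resize(cap, bsz, nbuckets),
		       (unsigned long long)(cap / bsz));	/* 2097152 */
		return 0;
	}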
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 4c6e6c46..40adefe7 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -45,7 +45,6 @@
#include <linux/blkdev.h>
#include <linux/sort.h>
-#include <linux/string_choices.h>
#include <linux/sched/clock.h>
#include "util.h"
@@ -158,7 +157,6 @@ write_attribute(trigger_recalc_capacity);
write_attribute(trigger_delete_dead_snapshots);
write_attribute(trigger_emergency_read_only);
read_attribute(gc_gens_pos);
-__sysfs_attribute(read_fua_test, 0400);
read_attribute(uuid);
read_attribute(minor);
@@ -306,116 +304,6 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
}
-static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca)
-{
- struct bch_fs *c = ca->fs;
- struct bio *bio = NULL;
- void *buf = NULL;
- unsigned bs = c->opts.block_size, iters;
- u64 end, test_duration = NSEC_PER_SEC * 2;
- struct bch2_time_stats stats_nofua, stats_fua, stats_random;
- int ret = 0;
-
- bch2_time_stats_init_no_pcpu(&stats_nofua);
- bch2_time_stats_init_no_pcpu(&stats_fua);
- bch2_time_stats_init_no_pcpu(&stats_random);
-
- if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) {
- prt_str(out, "offline\n");
- return 0;
- }
-
- struct block_device *bdev = ca->disk_sb.bdev;
-
- bio = bio_kmalloc(1, GFP_KERNEL);
- if (!bio) {
- ret = -ENOMEM;
- goto err;
- }
-
- buf = kmalloc(bs, GFP_KERNEL);
- if (!buf)
- goto err;
-
- end = ktime_get_ns() + test_duration;
- for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
- bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
- bch2_bio_map(bio, buf, bs);
-
- u64 submit_time = ktime_get_ns();
- ret = submit_bio_wait(bio);
- bch2_time_stats_update(&stats_nofua, submit_time);
-
- if (ret)
- goto err;
- }
-
- end = ktime_get_ns() + test_duration;
- for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
- bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
- bch2_bio_map(bio, buf, bs);
-
- u64 submit_time = ktime_get_ns();
- ret = submit_bio_wait(bio);
- bch2_time_stats_update(&stats_fua, submit_time);
-
- if (ret)
- goto err;
- }
-
- u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
-
- end = ktime_get_ns() + test_duration;
- for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
- bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
- bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
- bch2_bio_map(bio, buf, bs);
-
- u64 submit_time = ktime_get_ns();
- ret = submit_bio_wait(bio);
- bch2_time_stats_update(&stats_random, submit_time);
-
- if (ret)
- goto err;
- }
-
- u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats);
- u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats);
- u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats);
-
- u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats);
- u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats);
- u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats);
-
- printbuf_tabstop_push(out, 8);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 12);
- prt_printf(out, "This test must be run on an idle drive for accurate results\n");
- prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device));
- prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev)));
- prt_newline(out);
- prt_printf(out, "ns:\tlatency\rstddev\r\n");
- prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua);
- prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua);
- prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand);
-
- bool read_cache = ns_nofua * 2 < ns_rand;
- bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;
-
- if (!read_cache)
- prt_str(out, "reads don't appear to be cached - safe\n");
- else if (!fua_cached)
- prt_str(out, "fua reads don't appear to be cached - safe\n");
- else
- prt_str(out, "fua reads appear to be cached - unsafe\n");
-err:
- kfree(buf);
- kfree(bio);
- enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test);
- bch_err_fn(c, ret);
- return ret;
-}
-
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -784,7 +672,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
u64 v;
ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
- bch2_opt_hook_pre_set(c, ca, 0, id, v);
+ bch2_opt_hook_pre_set(c, ca, 0, id, v, true);
kfree(tmp);
if (ret < 0)
@@ -959,9 +847,6 @@ SHOW(bch2_dev)
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, ca);
- if (attr == &sysfs_read_fua_test)
- return bch2_read_fua_test(out, ca);
-
int opt_id = bch2_opt_lookup(attr->name);
if (opt_id >= 0)
return sysfs_opt_show(c, ca, opt_id, out);
@@ -1026,8 +911,6 @@ struct attribute *bch2_dev_files[] = {
&sysfs_congested,
#endif
- &sysfs_read_fua_test,
-
/* debug: */
&sysfs_alloc_debug,
&sysfs_open_buckets,
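
For reference, the removed read_fua_test classified a drive purely from mean latencies: reads count as cached when sequential no-FUA reads run at least twice as fast as random reads, and FUA reads count as cached (the unsafe case) when their mean falls below the midpoint of those two. The heuristic restated with hypothetical latencies:

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		/* illustrative means, in ns: */
		unsigned long long ns_nofua = 50000, ns_fua = 60000, ns_rand = 200000;

		bool read_cache = ns_nofua * 2 < ns_rand;		/* 100us < 200us */
		bool fua_cached = read_cache &&
			ns_fua < (ns_nofua + ns_rand) / 2;		/* 60us < 125us */

		printf(fua_cached ? "fua reads appear to be cached - unsafe\n" :
		       read_cache ? "fua reads don't appear to be cached - safe\n" :
				    "reads don't appear to be cached - safe\n");
		return 0;
	}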
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 2a946227..16d746f1 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -299,8 +299,10 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne
if (ret)
return ret;
+ skipnr += task == current;
+
do {
- nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr);
} while (nr_entries == stack->size &&
!(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
@@ -321,8 +323,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
{
+ skipnr += task == current;
+
CLASS(bch_stacktrace, stack)();
- int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
+ int ret = bch2_save_backtrace(&stack, task, skipnr, gfp);
bch2_prt_backtrace(out, &stack);
return ret;
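
The `skipnr += task == current` change skips one extra frame only when a task captures its own stack (where the saving function itself appears in the trace), instead of unconditionally passing skipnr + 1, which over-skipped for remote tasks. It leans on a comparison evaluating to 0 or 1; a micro-sketch of the idiom:

	#include <assert.h>

	int main(void)
	{
		void *task = (void *)0x1, *current = task;	/* saving our own stack */
		unsigned skipnr = 2;

		skipnr += task == current;	/* comparison yields 1, so skipnr == 3 */
		assert(skipnr == 3);
		return 0;
	}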
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 784e75a2..2b8d0502 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -550,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
if (ret < 0)
goto err;
- ret = bch2_opt_hook_pre_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v);
+ ret = bch2_opt_hook_pre_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v, true);
if (ret < 0)
goto err;
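
Both this call site and sysfs_opt_store() above chain the parse and pre-set-hook steps with the GNU `?:` extension: `a ?: b` evaluates to a when a is nonzero (an error), only evaluating b on success, so the first failing step's error code propagates. A sketch with hypothetical step functions:

	#include <stdio.h>

	static int parse_step(void) { return 0; }	/* success */
	static int hook_step(void)  { return -22; }	/* -EINVAL, say */

	int main(void)
	{
		int ret = parse_step() ?: hook_step();

		printf("ret = %d\n", ret);	/* parse ok, hook fails: -22 */
		return 0;
	}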
diff --git a/linux/closure.c b/linux/closure.c
index 4fb78d18..f1b4a797 100644
--- a/linux/closure.c
+++ b/linux/closure.c
@@ -13,65 +13,83 @@
#include <linux/seq_file.h>
#include <linux/sched/debug.h>
-static inline void closure_put_after_sub_checks(struct closure *cl, int flags)
+static void closure_val_checks(struct closure *cl, unsigned new, int d)
{
- int r = flags & CLOSURE_REMAINING_MASK;
+ unsigned count = new & CLOSURE_REMAINING_MASK;
- if (WARN(flags & CLOSURE_GUARD_MASK,
- "closure %ps has guard bits set: %x (%u)",
+ if (WARN(new & CLOSURE_GUARD_MASK,
+ "closure %ps has guard bits set: %x (%u), delta %i",
cl->fn,
- flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
- r &= ~CLOSURE_GUARD_MASK;
+ new, (unsigned) __fls(new & CLOSURE_GUARD_MASK), d))
+ new &= ~CLOSURE_GUARD_MASK;
- WARN(!r && (flags & ~CLOSURE_DESTRUCTOR),
+ WARN(!count && (new & ~(CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING)),
"closure %ps ref hit 0 with incorrect flags set: %x (%u)",
cl->fn,
- flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
+ new, (unsigned) __fls(new));
}
-static inline void closure_put_after_sub(struct closure *cl, int flags)
+enum new_closure_state {
+ CLOSURE_normal_put,
+ CLOSURE_requeue,
+ CLOSURE_done,
+};
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
{
- closure_put_after_sub_checks(cl, flags);
+ enum new_closure_state s;
+ struct task_struct *sleeper;
- if (!(flags & CLOSURE_REMAINING_MASK)) {
- smp_acquire__after_ctrl_dep();
+ /* rcu_read_lock, atomic_read_acquire() are both for cl->sleeper: */
+ guard(rcu)();
- cl->closure_get_happened = false;
+ int old = atomic_read_acquire(&cl->remaining), new;
+ do {
+ new = old - v;
- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
- atomic_set(&cl->remaining,
- CLOSURE_REMAINING_INITIALIZER);
- closure_queue(cl);
+ if (new & CLOSURE_REMAINING_MASK) {
+ s = CLOSURE_normal_put;
} else {
- struct closure *parent = cl->parent;
- closure_fn *destructor = cl->fn;
+ if ((cl->fn || (new & CLOSURE_SLEEPING)) &&
+ !(new & CLOSURE_DESTRUCTOR)) {
+ s = CLOSURE_requeue;
+ new += CLOSURE_REMAINING_INITIALIZER;
+ } else
+ s = CLOSURE_done;
- closure_debug_destroy(cl);
+ sleeper = new & CLOSURE_SLEEPING ? cl->sleeper : NULL;
+ new &= ~CLOSURE_SLEEPING;
+ }
- if (destructor)
- destructor(&cl->work);
+ closure_val_checks(cl, new, -v);
+ } while (!atomic_try_cmpxchg_release(&cl->remaining, &old, new));
- if (parent)
- closure_put(parent);
- }
+ if (s == CLOSURE_normal_put)
+ return;
+
+ if (sleeper) {
+ smp_mb();
+ wake_up_process(sleeper);
+ return;
}
-}
-/* For clearing flags with the same atomic op as a put */
-void closure_sub(struct closure *cl, int v)
-{
- closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
-}
-EXPORT_SYMBOL(closure_sub);
+ if (s == CLOSURE_requeue) {
+ closure_queue(cl);
+ } else {
+ struct closure *parent = cl->parent;
+ closure_fn *destructor = cl->fn;
-/*
- * closure_put - decrement a closure's refcount
- */
-void closure_put(struct closure *cl)
-{
- closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
+ closure_debug_destroy(cl);
+
+ if (destructor)
+ destructor(&cl->work);
+
+ if (parent)
+ closure_put(parent);
+ }
}
-EXPORT_SYMBOL(closure_put);
+EXPORT_SYMBOL(closure_sub);
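
closure_sub() now decides between a normal put, a requeue, and a sleeper wakeup inside a single atomic_try_cmpxchg_release() loop; on failure try_cmpxchg reloads *old, so each retry recomputes `new` from the fresh value. A self-contained C11 sketch of the same read-modify-write pattern, with an illustrative flag layout rather than the real closure encoding:

	#include <stdatomic.h>
	#include <stdio.h>

	#define REMAINING_MASK	0x0000ffffu
	#define FLAG_SLEEPING	0x00010000u

	static _Atomic unsigned remaining = 2 | FLAG_SLEEPING;

	static void sub(unsigned v)
	{
		unsigned old = atomic_load_explicit(&remaining, memory_order_acquire);
		unsigned new;

		do {
			new = old - v;
			if (!(new & REMAINING_MASK))
				new &= ~FLAG_SLEEPING;	/* ref hit zero: claim the wakeup */
			/* on failure, compare_exchange writes the current value
			 * back into `old`, so the loop recomputes `new`: */
		} while (!atomic_compare_exchange_weak_explicit(&remaining, &old, new,
					memory_order_release, memory_order_relaxed));

		if (!(new & REMAINING_MASK) && (old & FLAG_SLEEPING))
			printf("last ref: would wake_up_process(cl->sleeper)\n");
	}

	int main(void)
	{
		sub(1);
		sub(1);		/* this put hits zero and claims the sleeper */
		return 0;
	}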
/*
* closure_wake_up - wake up all closures on a wait list, without memory barrier
@@ -107,43 +125,26 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
return false;
- cl->closure_get_happened = true;
closure_set_waiting(cl, _RET_IP_);
- atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+ unsigned r = atomic_add_return(CLOSURE_WAITING + 1, &cl->remaining);
+ closure_val_checks(cl, r, CLOSURE_WAITING + 1);
+
llist_add(&cl->list, &waitlist->list);
return true;
}
EXPORT_SYMBOL(closure_wait);
-struct closure_syncer {
- struct task_struct *task;
- int done;
-};
-
-static CLOSURE_CALLBACK(closure_sync_fn)
-{
- struct closure *cl = container_of(ws, struct closure, work);
- struct closure_syncer *s = cl->s;
- struct task_struct *p;
-
- rcu_read_lock();
- p = READ_ONCE(s->task);
- s->done = 1;
- wake_up_process(p);
- rcu_read_unlock();
-}
-
void __sched __closure_sync(struct closure *cl)
{
- struct closure_syncer s = { .task = current };
-
- cl->s = &s;
- continue_at(cl, closure_sync_fn, NULL);
+ cl->sleeper = current;
+ closure_sub(cl,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_SLEEPING);
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (s.done)
+ if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
break;
schedule();
}
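
__closure_sync() now parks the caller directly: it publishes itself via cl->sleeper, drops the initial ref while leaving CLOSURE_SLEEPING set, then sleeps until the final put clears the flag and wakes it. A userspace sketch of that handshake using C11 threads; the kernel instead pairs set_current_state() with schedule() so the re-check before sleeping cannot lose a wakeup, while the sketch simply yield-spins:

	#include <stdatomic.h>
	#include <stdio.h>
	#include <threads.h>

	#define FLAG_SLEEPING	0x00010000u

	static _Atomic unsigned remaining = 1 | FLAG_SLEEPING;

	static int final_put(void *arg)
	{
		(void)arg;
		/* the last put clears SLEEPING with release semantics,
		 * standing in for closure_sub() waking cl->sleeper: */
		atomic_fetch_and_explicit(&remaining, ~(1u | FLAG_SLEEPING),
					  memory_order_release);
		return 0;
	}

	int main(void)
	{
		thrd_t t;
		thrd_create(&t, final_put, NULL);

		/* like the while (1) loop above: re-check SLEEPING each pass */
		while (atomic_load_explicit(&remaining,
					    memory_order_acquire) & FLAG_SLEEPING)
			thrd_yield();

		thrd_join(t, NULL);
		printf("synced\n");
		return 0;
	}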
@@ -157,31 +158,25 @@ EXPORT_SYMBOL(__closure_sync);
* for outstanding get()s to finish) and returning once closure refcount is 0.
*
* Unlike closure_sync() this doesn't reinit the ref to 1; subsequent
- * closure_get_not_zero() calls waill fail.
+ * closure_get_not_zero() calls will fail.
*/
void __sched closure_return_sync(struct closure *cl)
{
- struct closure_syncer s = { .task = current };
-
- cl->s = &s;
- set_closure_fn(cl, closure_sync_fn, NULL);
-
- unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR,
- &cl->remaining);
-
- closure_put_after_sub_checks(cl, flags);
-
- if (unlikely(flags & CLOSURE_REMAINING_MASK)) {
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (s.done)
- break;
- schedule();
- }
+ cl->sleeper = current;
+ closure_sub(cl,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_DESTRUCTOR -
+ CLOSURE_SLEEPING);
- __set_current_state(TASK_RUNNING);
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+ break;
+ schedule();
}
+ __set_current_state(TASK_RUNNING);
+
if (cl->parent)
closure_put(cl->parent);
}
@@ -189,31 +184,35 @@ EXPORT_SYMBOL(closure_return_sync);
int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout)
{
- struct closure_syncer s = { .task = current };
int ret = 0;
- cl->s = &s;
- continue_at(cl, closure_sync_fn, NULL);
+ cl->sleeper = current;
+ closure_sub(cl,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_SLEEPING);
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (s.done)
- break;
+ /*
+ * Carefully undo the continue_at() - but only if it
+ * hasn't completed, i.e. the final closure_put() hasn't
+ * happened yet:
+ */
+ unsigned old = atomic_read(&cl->remaining), new;
+ if (!(old & CLOSURE_SLEEPING))
+ goto success;
+
if (!timeout) {
- /*
- * Carefully undo the continue_at() - but only if it
- * hasn't completed, i.e. the final closure_put() hasn't
- * happened yet:
- */
- unsigned old, new, v = atomic_read(&cl->remaining);
do {
- old = v;
- if (!old || (old & CLOSURE_RUNNING))
+ if (!(old & CLOSURE_SLEEPING))
goto success;
- new = old + CLOSURE_REMAINING_INITIALIZER;
- } while ((v = atomic_cmpxchg(&cl->remaining, old, new)) != old);
+ new = old + CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING;
+ closure_val_checks(cl, new, CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING);
+ } while (!atomic_try_cmpxchg(&cl->remaining, &old, new));
+
ret = -ETIME;
+ break;
}
timeout = schedule_timeout(timeout);