commit 34b5654d9eb1999704e75d964645e3aa9b78e249 (tag: v1.9.2)
tree   27269b040f9128cf33155a9ca47038703efc5606
parent b0eb3c29304f9a4ca39c8534bb6476b170b2a7d0
author    Kent Overstreet <kent.overstreet@linux.dev>  2024-06-28 18:11:46 -0400
committer Kent Overstreet <kent.overstreet@linux.dev>  2024-06-28 18:26:04 -0400

Update bcachefs sources to 9404a01d3dc5 bcachefs: Make read_only a mount option again, but hidden

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
 .bcachefs_revision                  |   2
 include/linux/atomic.h              |   7
 include/linux/closure.h             |  23
 include/linux/workqueue.h           |   2
 libbcachefs/alloc_background.c      | 277
 libbcachefs/alloc_background.h      |   9
 libbcachefs/alloc_foreground.c      |   4
 libbcachefs/bcachefs.h              |  20
 libbcachefs/bcachefs_format.h       |  18
 libbcachefs/btree_gc.c              |   7
 libbcachefs/btree_iter.c            |  33
 libbcachefs/btree_key_cache.c       | 324
 libbcachefs/btree_trans_commit.c    |  11
 libbcachefs/btree_types.h           |   1
 libbcachefs/buckets.c               |   4
 libbcachefs/buckets.h               |   4
 libbcachefs/chardev.c               |  23
 libbcachefs/checksum.c              |   5
 libbcachefs/debug.c                 | 109
 libbcachefs/disk_accounting.c       |  14
 libbcachefs/ec.c                    |   2
 libbcachefs/error.c                 |  19
 libbcachefs/error.h                 |   7
 libbcachefs/extents.c               |  21
 libbcachefs/extents.h               |   2
 libbcachefs/fs-io-buffered.c        |  41
 libbcachefs/fs-io-direct.c          |   4
 libbcachefs/fs-io-pagecache.c       |  37
 libbcachefs/fs-io-pagecache.h       |   7
 libbcachefs/fs.c                    |  34
 libbcachefs/inode.c                 |   3
 libbcachefs/io_read.c               |   1
 libbcachefs/io_write.c              |   5
 libbcachefs/journal.c               |   5
 libbcachefs/journal_io.c            |  24
 libbcachefs/journal_seq_blacklist.c |   2
 libbcachefs/lru.h                   |  12
 libbcachefs/lru_format.h            |  25
 libbcachefs/opts.h                  |   5
 libbcachefs/recovery.c              |   4
 libbcachefs/recovery_passes.c       |   4
 libbcachefs/sb-downgrade.c          |   1
 libbcachefs/sb-errors.c             |  14
 libbcachefs/sb-errors_format.h      | 564
 libbcachefs/seqmutex.h              |  11
 libbcachefs/snapshot.c              |   5
 libbcachefs/super.c                 |  13
 libbcachefs/util.h                  |  17
 linux/closure.c                     |  54
 49 files changed, 1003 insertions(+), 837 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 9c418ae2..50da14dd 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-792ca5ba3c9a07d762d9c1a440e31c0520f37de0
+9404a01d3dc5553b106fa590602f4771b8e0b8ae
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 73024023..dcc6e644 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -26,6 +26,7 @@ typedef struct {
#define __ATOMIC_READ(p) uatomic_read(p)
#define __ATOMIC_SET(p, v) uatomic_set(p, v)
+#define __ATOMIC_SET_RELEASE(p, v) uatomic_set(p, v)
#define __ATOMIC_ADD_RETURN(v, p) uatomic_add_return(p, v)
#define __ATOMIC_SUB_RETURN(v, p) uatomic_sub_return(p, v)
#define __ATOMIC_ADD(v, p) uatomic_add(p, v)
@@ -64,6 +65,7 @@ typedef struct {
#define __ATOMIC_READ(p) __atomic_load_n(p, __ATOMIC_RELAXED)
#define __ATOMIC_SET(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_SET_RELEASE(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE)
#define __ATOMIC_ADD_RETURN(v, p) __atomic_add_fetch(p, v, __ATOMIC_RELAXED)
#define __ATOMIC_ADD_RETURN_RELEASE(v, p) \
__atomic_add_fetch(p, v, __ATOMIC_RELEASE)
@@ -189,6 +191,11 @@ static inline void a_type##_set(a_type##_t *v, i_type i) \
return __ATOMIC_SET(&v->counter, i); \
} \
\
+static inline void a_type##_set_release(a_type##_t *v, i_type i) \
+{ \
+ return __ATOMIC_SET_RELEASE(&v->counter, i); \
+} \
+ \
static inline i_type a_type##_add_return(i_type i, a_type##_t *v) \
{ \
return __ATOMIC_ADD_RETURN(i, &v->counter); \
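
The new __ATOMIC_SET_RELEASE wrappers give the userspace shim an equivalent of the
kernel's atomic_set_release(): a release store pairs with an acquire load, so that
everything written before the store is visible to whoever observes it. A minimal
sketch of that pairing, using the same GCC builtins as the fallback above
(illustrative only, not part of the patch):

    #include <stdbool.h>

    struct msg {
    	int payload;
    	int ready;
    };

    /* producer: fill in payload, then publish it with a release store */
    static void publish(struct msg *m, int v)
    {
    	m->payload = v;
    	__atomic_store_n(&m->ready, 1, __ATOMIC_RELEASE);
    }

    /* consumer: the acquire load orders the payload read after seeing ready */
    static bool consume(struct msg *m, int *v)
    {
    	if (!__atomic_load_n(&m->ready, __ATOMIC_ACQUIRE))
    		return false;
    	*v = m->payload;	/* guaranteed to see the producer's write */
    	return true;
    }

closure_init_stack_release() below is exactly this producer side: it publishes a
fully-initialized closure.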
diff --git a/include/linux/closure.h b/include/linux/closure.h
index 99155df1..59b8c06b 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -285,6 +285,21 @@ static inline void closure_get(struct closure *cl)
}
/**
+ * closure_get_not_zero - take a reference on a closure, unless the refcount
+ * has already hit zero (the closure is finishing and must not be used)
+ */
+static inline bool closure_get_not_zero(struct closure *cl)
+{
+ unsigned old = atomic_read(&cl->remaining);
+ do {
+ if (!(old & CLOSURE_REMAINING_MASK))
+ return false;
+
+ } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1));
+
+ return true;
+}
+
+/**
* closure_init - Initialize a closure, setting the refcount to 1
* @cl: closure to initialize
* @parent: parent of the new closure. cl will take a refcount on it for its
@@ -310,6 +325,12 @@ static inline void closure_init_stack(struct closure *cl)
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
}
+static inline void closure_init_stack_release(struct closure *cl)
+{
+ memset(cl, 0, sizeof(struct closure));
+ atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+}
+
/**
* closure_wake_up - wake up all closures on a wait list,
* with memory barrier
@@ -355,6 +376,8 @@ do { \
*/
#define closure_return(_cl) continue_at((_cl), NULL, NULL)
+void closure_return_sync(struct closure *cl);
+
/**
* continue_at_nobarrier - jump to another function without barrier
*
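
closure_get_not_zero() is the standard "tryget" shape: loop on a compare-and-swap,
refusing to take a reference once the count has reached zero, so an object that
has begun dying is never resurrected. The same pattern against a bare atomic
counter, so it can be read in isolation (a sketch, not the closure code itself):

    #include <stdbool.h>

    struct obj {
    	unsigned refcount;	/* 0 means: dying, hands off */
    };

    static bool obj_tryget(struct obj *o)
    {
    	unsigned old = __atomic_load_n(&o->refcount, __ATOMIC_RELAXED);

    	do {
    		if (!old)
    			return false;	/* already released; caller must not touch it */
    	} while (!__atomic_compare_exchange_n(&o->refcount, &old, old + 1,
    					      false, __ATOMIC_ACQUIRE,
    					      __ATOMIC_RELAXED));
    	return true;
    }

This is what lets the debug.c changes below walk btree_trans_list and simply skip
transactions whose ref has already been dropped, where closure_get() would have
been unsafe.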
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1406c958..5d2ca5f8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -151,7 +151,7 @@ extern void workqueue_set_max_active(struct workqueue_struct *wq,
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
extern unsigned int work_busy(struct work_struct *work);
-extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
+static inline __printf(1, 2) void set_worker_desc(const char *fmt, ...) {}
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_workqueue_state(void);
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 77aa85b9..8e8aed2a 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -30,7 +30,7 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
/* Persistent alloc info: */
@@ -476,7 +476,8 @@ err:
}
__flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
+struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
@@ -484,7 +485,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
bch2_trans_iter_exit(trans, &iter);
return unlikely(ret) ? ERR_PTR(ret) : a;
}
@@ -595,8 +596,6 @@ int bch2_alloc_read(struct bch_fs *c)
struct bch_dev *ca = NULL;
int ret;
- down_read(&c->gc_lock);
-
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_prefetch, k, ({
@@ -645,7 +644,6 @@ int bch2_alloc_read(struct bch_fs *c)
bch2_dev_put(ca);
bch2_trans_put(trans);
- up_read(&c->gc_lock);
bch_err_fn(c, ret);
return ret;
@@ -847,6 +845,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ alloc_data_type_set(new_a, new_a->data_type);
}
if (old_a->data_type != new_a->data_type ||
@@ -958,12 +957,12 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (statechange(a->data_type == BCH_DATA_need_discard) &&
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
bucket_flushed(new_a))
- bch2_discard_one_bucket_fast(c, new.k->p);
+ bch2_discard_one_bucket_fast(ca, new.k->p.offset);
if (statechange(a->data_type == BCH_DATA_cached) &&
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_gc_gens_async(c);
@@ -1684,34 +1683,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
-static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
int ret;
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
ret = -BCH_ERR_EEXIST_discard_in_flight_add;
goto out;
}
- ret = darray_push(&c->discard_buckets_in_flight, bucket);
+ ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
+ .in_progress = in_progress,
+ .bucket = bucket,
+ }));
out:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
return ret;
}
-static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ BUG_ON(!i->in_progress);
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
goto found;
}
BUG();
found:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
}
struct discard_buckets_state {
@@ -1719,26 +1722,11 @@ struct discard_buckets_state {
u64 open;
u64 need_journal_commit;
u64 discarded;
- struct bch_dev *ca;
u64 need_journal_commit_this_dev;
};
-static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
-{
- if (s->ca == ca)
- return;
-
- if (s->ca && s->need_journal_commit_this_dev >
- bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (s->ca)
- percpu_ref_put(&s->ca->io_ref);
- s->ca = ca;
- s->need_journal_commit_this_dev = 0;
-}
-
static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
struct discard_buckets_state *s)
@@ -1752,16 +1740,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
bool discard_locked = false;
int ret = 0;
- struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
- ? s->ca
- : bch2_dev_get_ioref(c, pos.inode, WRITE);
- if (!ca) {
- bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
- return 0;
- }
-
- discard_buckets_next_dev(c, s, ca);
-
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
s->open++;
goto out;
@@ -1821,7 +1799,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
- if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+ if (discard_in_flight_add(ca, iter.pos.offset, true))
goto out;
discard_locked = true;
@@ -1845,8 +1823,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
- alloc_data_type_set(&a->v, a->v.data_type);
write:
+ alloc_data_type_set(&a->v, a->v.data_type);
+
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
@@ -1858,7 +1837,7 @@ write:
s->discarded++;
out:
if (discard_locked)
- discard_in_flight_remove(c, iter.pos);
+ discard_in_flight_remove(ca, iter.pos.offset);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
@@ -1867,7 +1846,8 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
+ struct bch_fs *c = ca->fs;
struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1878,23 +1858,41 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
-
- discard_buckets_next_dev(c, &s, NULL);
+ for_each_btree_key_upto(trans, iter,
+ BTREE_ID_need_discard,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx, U64_MAX), 0, k,
+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_discards(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_discards(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
- !queue_work(c->write_ref_wq, &c->discard_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ for_each_member_device(c, ca)
+ bch2_dev_do_discards(ca);
}
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
@@ -1923,68 +1921,69 @@ err:
static void bch2_do_discards_fast_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
+ struct bch_fs *c = ca->fs;
while (1) {
bool got_bucket = false;
- struct bpos bucket;
- struct bch_dev *ca;
+ u64 bucket;
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i) {
- if (i->snapshot)
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i) {
+ if (i->in_progress)
continue;
- ca = bch2_dev_get_ioref(c, i->inode, WRITE);
- if (!ca) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
- continue;
- }
-
got_bucket = true;
- bucket = *i;
- i->snapshot = true;
+ bucket = i->bucket;
+ i->in_progress = true;
break;
}
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
if (!got_bucket)
break;
if (ca->mi.discard && !c->opts.nochanges)
blkdev_issue_discard(ca->disk_sb.bdev,
- bucket.offset * ca->mi.bucket_size,
+ bucket_to_sector(ca, bucket),
ca->mi.bucket_size,
GFP_KERNEL);
int ret = bch2_trans_do(c, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_bucket_needs_discard(trans, bucket));
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
bch_err_fn(c, ret);
- percpu_ref_put(&ca->io_ref);
- discard_in_flight_remove(c, bucket);
+ discard_in_flight_remove(ca, bucket);
if (ret)
break;
}
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ percpu_ref_put(&ca->io_ref);
}
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
- bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
- rcu_read_unlock();
+ struct bch_fs *c = ca->fs;
+
+ if (discard_in_flight_add(ca, bucket, false))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
- if (!dead &&
- !discard_in_flight_add(c, bucket) &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
- !queue_work(c->write_ref_wq, &c->discard_fast_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@@ -2010,7 +2009,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
- a = bch2_trans_start_alloc_update(trans, bucket);
+ a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
@@ -2086,7 +2085,8 @@ again:
static void bch2_do_invalidates_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
+ struct bch_fs *c = ca->fs;
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
@@ -2094,50 +2094,63 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (ret)
goto err;
- for_each_member_device(c, ca) {
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- struct btree_iter iter;
- bool wrapped = false;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0,
- ((bch2_current_io_time(c, READ) + U32_MAX) &
- LRU_TIME_MAX)), 0);
-
- while (true) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- if (!k.k)
- break;
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ struct btree_iter iter;
+ bool wrapped = false;
- ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0,
+ ((bch2_current_io_time(c, READ) + U32_MAX) &
+ LRU_TIME_MAX)), 0);
- if (ret < 0) {
- bch2_dev_put(ca);
+ while (true) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
break;
- }
+ if (!k.k)
+ break;
+
+ ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&iter);
}
+ bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_invalidates(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->invalidate_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_invalidates(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
- !queue_work(c->write_ref_wq, &c->invalidate_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ for_each_member_device(c, ca)
+ bch2_dev_do_invalidates(ca);
}
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
@@ -2453,16 +2466,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_fs_allocator_background_exit(struct bch_fs *c)
+void bch2_dev_allocator_background_exit(struct bch_dev *ca)
+{
+ darray_exit(&ca->discard_buckets_in_flight);
+}
+
+void bch2_dev_allocator_background_init(struct bch_dev *ca)
{
- darray_exit(&c->discard_buckets_in_flight);
+ mutex_init(&ca->discard_buckets_in_flight_lock);
+ INIT_WORK(&ca->discard_work, bch2_do_discards_work);
+ INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
+ INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- mutex_init(&c->discard_buckets_in_flight_lock);
- INIT_WORK(&c->discard_work, bch2_do_discards_work);
- INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
- INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
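
bch2_dev_do_discards(), bch2_dev_do_invalidates() and the fast-discard path all
follow one shape: take the device ioref, then the filesystem write ref, hand both
to the work item, and unwind in reverse order if queue_work() returns false (the
work was already pending, so that instance owns the refs). A generic sketch of
the shape, with stand-in declarations for the two ref types (hypothetical names,
not the bcachefs API):

    #include <stdbool.h>

    struct fs;
    struct dev;

    /* stand-ins for the percpu_ref/bch2_write_ref operations */
    bool dev_ioref_tryget(struct dev *);
    void dev_ioref_put(struct dev *);
    bool fs_write_ref_tryget(struct fs *);
    void fs_write_ref_put(struct fs *);
    bool queue_dev_work(struct dev *);	/* false if already queued */

    static void dev_queue_background_work(struct fs *fs, struct dev *dev)
    {
    	if (!dev_ioref_tryget(dev))	/* device may be going away */
    		return;
    	if (!fs_write_ref_tryget(fs))	/* fs may be going read-only */
    		goto put_ioref;
    	if (queue_dev_work(dev))
    		return;			/* queued work now owns both refs */

    	fs_write_ref_put(fs);		/* pending instance already holds them */
    put_ioref:
    	dev_ioref_put(dev);
    }

The matching work function drops both refs when it finishes, as
bch2_do_discards_work() does above with bch2_write_ref_put() and percpu_ref_put().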
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index dcf58c38..8d2b62c9 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -206,7 +206,8 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct bpos);
+bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
+ enum btree_iter_update_trigger_flags);
void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
@@ -299,6 +300,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
enum btree_iter_update_trigger_flags);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_dev_do_discards(struct bch_dev *);
void bch2_do_discards(struct bch_fs *);
static inline u64 should_invalidate_buckets(struct bch_dev *ca,
@@ -313,6 +315,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
+void bch2_dev_do_invalidates(struct bch_dev *);
void bch2_do_invalidates(struct bch_fs *);
static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
@@ -336,7 +339,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_fs_allocator_background_exit(struct bch_fs *);
+void bch2_dev_allocator_background_exit(struct bch_dev *);
+void bch2_dev_allocator_background_init(struct bch_dev *);
+
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index a352a671..73228b25 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -621,13 +621,13 @@ again:
avail = dev_buckets_free(ca, *usage, watermark);
if (usage->d[BCH_DATA_need_discard].buckets > avail)
- bch2_do_discards(c);
+ bch2_dev_do_discards(ca);
if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (!avail) {
if (cl && !waiting) {
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 11676678..372bc339 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -496,6 +496,11 @@ struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
+struct discard_in_flight {
+ bool in_progress:1;
+ u64 bucket:63;
+};
+
struct bch_dev {
struct kobject kobj;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -533,8 +538,8 @@ struct bch_dev {
/*
* Buckets:
* Per-bucket arrays are protected by c->mark_lock, bucket_lock and
- * gc_lock, for device resize - holding any is sufficient for access:
- * Or rcu_read_lock(), but only for dev_ptr_stale():
+ * gc_gens_lock, for device resize - holding any is sufficient for
+ * access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
struct bucket_array __rcu *buckets_gc;
struct bucket_gens __rcu *bucket_gens;
@@ -555,6 +560,12 @@ struct bch_dev {
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
+ struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct discard_in_flight) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
+
atomic64_t rebalance_work;
struct journal_device journal;
@@ -909,11 +920,6 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct invalidate_work;
- struct work_struct discard_work;
- struct mutex discard_buckets_in_flight_lock;
- DARRAY(struct bpos) discard_buckets_in_flight;
- struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct work_struct gc_gens_work;
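
struct discard_in_flight packs the in-progress flag and the bucket number into one
64-bit word, so each darray element stays a single machine word; 63 bits is far
more than any plausible bucket index needs. A sketch of how the bitfield
round-trips (assumes the usual Linux ABIs, where this struct is 8 bytes):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct discard_in_flight {
    	bool	 in_progress:1;
    	uint64_t bucket:63;
    };

    int main(void)
    {
    	struct discard_in_flight d = { .in_progress = true, .bucket = 12345 };

    	assert(sizeof(d) == sizeof(uint64_t));	/* one word per entry */
    	d.in_progress = false;		/* flag flips without disturbing bucket */
    	assert(d.bucket == 12345);
    	return 0;
    }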
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 66ba8fb4..74a60b1a 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -468,18 +468,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
-/* LRU btree: */
-
-struct bch_lru {
- struct bch_val v;
- __le64 idx;
-} __packed __aligned(8);
-
-#define LRU_ID_STRIPES (1U << 16)
-
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -516,6 +504,7 @@ struct bch_sb_field {
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
+#include "lru_format.h"
#include "quota_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
@@ -954,8 +943,9 @@ enum bch_version_upgrade_opts {
#define BCH_ERROR_ACTIONS() \
x(continue, 0) \
- x(ro, 1) \
- x(panic, 2)
+ x(fix_safe, 1) \
+ x(panic, 2) \
+ x(ro, 3)
enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
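
BCH_ERROR_ACTIONS() is an x-macro list: every x(name, value) entry expands once
into this enum and again wherever the matching string table or option parser is
generated, so inserting fix_safe and renumbering ro only has to happen in one
place. A stripped-down sketch of the pattern:

    #include <stdio.h>

    #define MY_ERROR_ACTIONS()	\
    	x(continue,	0)	\
    	x(fix_safe,	1)	\
    	x(panic,	2)	\
    	x(ro,		3)

    /* expansion 1: the enum */
    enum my_error_action {
    #define x(t, n) MY_ON_ERROR_##t = n,
    	MY_ERROR_ACTIONS()
    #undef x
    };

    /* expansion 2: name strings generated from the same list */
    static const char * const my_error_action_strs[] = {
    #define x(t, n) [n] = #t,
    	MY_ERROR_ACTIONS()
    #undef x
    };

    int main(void)
    {
    	printf("%s\n", my_error_action_strs[MY_ON_ERROR_fix_safe]);
    	return 0;
    }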
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 0c2eb756..2e9ccb20 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1229,7 +1229,7 @@ int bch2_gc_gens(struct bch_fs *c)
int ret;
/*
- * Ideally we would be using state_lock and not gc_lock here, but that
+ * Ideally we would be using state_lock and not gc_gens_lock here, but that
* introduces a deadlock in the RO path - we currently take the state
* lock at the start of going RO, thus the gc thread may get stuck:
*/
@@ -1237,7 +1237,8 @@ int bch2_gc_gens(struct bch_fs *c)
return 0;
trace_and_count(c, gc_gens_start, c);
- down_read(&c->gc_lock);
+
+ down_read(&c->state_lock);
for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
@@ -1306,7 +1307,7 @@ err:
ca->oldest_gen = NULL;
}
- up_read(&c->gc_lock);
+ up_read(&c->state_lock);
mutex_unlock(&c->gc_gens_lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index c68cc714..80f4a395 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1801,13 +1801,12 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
goto hole;
} else {
struct bkey_cached *ck = (void *) path->l[0].b;
-
- EBUG_ON(ck &&
- (path->btree_id != ck->key.btree_id ||
- !bkey_eq(path->pos, ck->key.pos)));
- if (!ck || !ck->valid)
+ if (!ck)
return bkey_s_c_null;
+ EBUG_ON(path->btree_id != ck->key.btree_id ||
+ !bkey_eq(path->pos, ck->key.pos));
+
*u = ck->k->k;
k = bkey_i_to_s_c(ck->k);
}
@@ -3131,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
memset(trans, 0, sizeof(*trans));
- closure_init_stack(&trans->ref);
seqmutex_lock(&c->btree_trans_lock);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
@@ -3151,15 +3149,10 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
BUG_ON(pos_task &&
pid == pos_task->pid &&
pos->locked);
-
- if (pos_task && pid < pos_task->pid) {
- list_add_tail(&trans->list, &pos->list);
- goto list_add_done;
- }
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
+
+ list_add(&trans->list, &c->btree_trans_list);
seqmutex_unlock(&c->btree_trans_lock);
got_trans:
trans->c = c;
@@ -3200,6 +3193,8 @@ got_trans:
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
+
+ closure_init_stack_release(&trans->ref);
return trans;
}
@@ -3236,7 +3231,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans_for_each_update(trans, i)
__btree_path_put(trans->paths + i->path, true);
trans->nr_updates = 0;
- trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans);
@@ -3248,6 +3242,13 @@ void bch2_trans_put(struct btree_trans *trans)
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
+ /*
+ * trans->ref protects trans->locking_wait.task, btree_paths array; used
+ * by cycle detector
+ */
+ closure_return_sync(&trans->ref);
+ trans->locking_wait.task = NULL;
+
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
trans->paths = NULL;
@@ -3265,8 +3266,6 @@ void bch2_trans_put(struct btree_trans *trans)
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3386,8 +3385,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 8b2fd0ae..f2f2e525 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -205,9 +205,22 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc,
six_unlock_intent(&ck->c.lock);
}
+static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
+{
+ struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
+ if (unlikely(!ck))
+ return NULL;
+ ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
+ if (unlikely(!ck->k)) {
+ kmem_cache_free(bch2_key_cache, ck);
+ return NULL;
+ }
+ ck->u64s = key_u64s;
+ return ck;
+}
+
static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
- bool *was_new)
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
@@ -281,8 +294,10 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
}
ck = allocate_dropping_locks(trans, ret,
- kmem_cache_zalloc(bch2_key_cache, _gfp));
+ __bkey_cached_alloc(key_u64s, _gfp));
if (ret) {
+ if (ck)
+ kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
return ERR_PTR(ret);
}
@@ -296,7 +311,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
- *was_new = true;
return ck;
}
@@ -326,71 +340,102 @@ out:
return ck;
}
-static struct bkey_cached *
-btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
+static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck;
- bool was_new = false;
- ck = bkey_cached_alloc(trans, path, &was_new);
- if (IS_ERR(ck))
- return ck;
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at
+ * most 7 bytes (it won't be used):
+ */
+ unsigned key_u64s = k.k->u64s + 1;
+
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ key_u64s = min(256U, (key_u64s * 3) / 2);
+ key_u64s = roundup_pow_of_two(key_u64s);
+
+ struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s);
+ int ret = PTR_ERR_OR_ZERO(ck);
+ if (ret)
+ return ret;
if (unlikely(!ck)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
bch2_btree_id_str(path->btree_id));
- return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
+ return -BCH_ERR_ENOMEM_btree_key_cache_create;
}
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
}
ck->c.level = 0;
ck->c.btree_id = path->btree_id;
ck->key.btree_id = path->btree_id;
ck->key.pos = path->pos;
- ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
- if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
- &ck->hash,
- bch2_btree_key_cache_params))) {
- /* We raced with another fill: */
-
- if (likely(was_new)) {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- kfree(ck);
- } else {
- bkey_cached_free_fast(bc, ck);
+ if (unlikely(key_u64s > ck->u64s)) {
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+
+ struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
+ kmalloc(key_u64s * sizeof(u64), _gfp));
+ if (unlikely(!new_k)) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(ck->key.btree_id), key_u64s);
+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ } else if (ret) {
+ kfree(new_k);
+ goto err;
}
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
- return NULL;
+ kfree(ck->k);
+ ck->k = new_k;
+ ck->u64s = key_u64s;
}
- atomic_long_inc(&bc->nr_keys);
+ bkey_reassemble(ck->k, k);
+ ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
+ if (unlikely(ret)) /* raced with another fill? */
+ goto err;
+
+ atomic_long_inc(&bc->nr_keys);
six_unlock_write(&ck->c.lock);
- return ck;
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+ if (lock_want == SIX_LOCK_read)
+ six_lock_downgrade(&ck->c.lock);
+ btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+err:
+ bkey_cached_free_fast(bc, ck);
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+
+ return ret;
}
-static int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
- struct bkey_cached *ck)
+static noinline int btree_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ unsigned flags)
{
+ if (flags & BTREE_ITER_cached_nofill) {
+ ck_path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+ }
+
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- unsigned new_u64s = 0;
- struct bkey_i *new_k = NULL;
int ret;
- bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+ bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
BTREE_ITER_key_cache_fill|
BTREE_ITER_cached_nofill);
iter.flags &= ~BTREE_ITER_with_journal;
@@ -399,70 +444,15 @@ static int btree_key_cache_fill(struct btree_trans *trans,
if (ret)
goto err;
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
- goto err;
- }
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at
- * most 7 bytes (it won't be used):
- */
- new_u64s = k.k->u64s + 1;
-
- /*
- * Allocate some extra space so that the transaction commit path is less
- * likely to have to reallocate, since that requires a transaction
- * restart:
- */
- new_u64s = min(256U, (new_u64s * 3) / 2);
-
- if (new_u64s > ck->u64s) {
- new_u64s = roundup_pow_of_two(new_u64s);
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
- if (!new_k) {
- bch2_trans_unlock(trans);
-
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
- if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(ck->key.btree_id), new_u64s);
- ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
- goto err;
- }
-
- ret = bch2_trans_relock(trans);
- if (ret) {
- kfree(new_k);
- goto err;
- }
-
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- kfree(new_k);
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
- goto err;
- }
- }
- }
+ /* Recheck after btree lookup, before allocating: */
+ ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
+ if (unlikely(ret))
+ goto out;
- ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
- if (ret) {
- kfree(new_k);
+ ret = btree_key_cache_create(trans, ck_path, k);
+ if (ret)
goto err;
- }
-
- if (new_k) {
- kfree(ck->k);
- ck->u64s = new_u64s;
- ck->k = new_k;
- }
-
- bkey_reassemble(ck->k, k);
- ck->valid = true;
- bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
-
+out:
/* We're not likely to need this iterator again: */
bch2_set_btree_iter_dontneed(&iter);
err:
@@ -470,128 +460,62 @@ err:
return ret;
}
-static noinline int
-bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
- unsigned flags)
+static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
+ struct btree_path *path)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
- int ret = 0;
-
- BUG_ON(path->level);
-
- path->l[1].b = NULL;
-
- if (bch2_btree_node_relock_notrace(trans, path, 0)) {
- ck = (void *) path->l[0].b;
- goto fill;
- }
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck) {
- ck = btree_key_cache_create(trans, path);
- ret = PTR_ERR_OR_ZERO(ck);
- if (ret)
- goto err;
- if (!ck)
- goto retry;
-
- btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
- path->locks_want = 1;
- } else {
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- ret = btree_node_lock(trans, path, (void *) ck, 0,
- lock_want, _THIS_IP_);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- BUG_ON(ret);
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
- }
+ if (!ck)
+ return -ENOENT;
- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
- }
-fill:
- path->uptodate = BTREE_ITER_UPTODATE;
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
- if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
- ret = bch2_btree_path_upgrade(trans, path, 1) ?:
- btree_key_cache_fill(trans, path, ck) ?:
- bch2_btree_path_relock(trans, path, _THIS_IP_);
- if (ret)
- goto err;
+ int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
+ if (ret)
+ return ret;
- path->uptodate = BTREE_ITER_UPTODATE;
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
}
if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
- BUG_ON(path->uptodate);
-
- return ret;
-err:
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(ret);
- }
- return ret;
+ btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
}
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
unsigned flags)
{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck;
- int ret = 0;
-
EBUG_ON(path->level);
path->l[1].b = NULL;
if (bch2_btree_node_relock_notrace(trans, path, 0)) {
- ck = (void *) path->l[0].b;
- goto fill;
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
}
-retry:
- ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck)
- return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
-
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
- ret = btree_node_lock(trans, path, (void *) ck, 0,
- lock_want, _THIS_IP_);
- EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- if (ret)
- return ret;
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
+ int ret;
+ do {
+ ret = btree_path_traverse_cached_fast(trans, path);
+ if (unlikely(ret == -ENOENT))
+ ret = btree_key_cache_fill(trans, path, flags);
+ } while (ret == -EEXIST);
+
+ if (unlikely(ret)) {
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
+ }
}
-
- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
-fill:
- if (!ck->valid)
- return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
-
- if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
- path->uptodate = BTREE_ITER_UPTODATE;
- EBUG_ON(!ck->valid);
- EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
-
return ret;
}
@@ -630,8 +554,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
}
- BUG_ON(!ck->valid);
-
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
@@ -753,7 +675,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
bkey_copy(ck->k, insert);
- ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
@@ -792,10 +713,9 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = (void *) path->l[0].b;
- BUG_ON(!ck->valid);
-
/*
* We just did an update to the btree, bypassing the key cache: the key
* cache key is now stale and must be dropped, even if dirty:
@@ -806,7 +726,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
bch2_journal_pin_drop(&c->journal, &ck->journal);
}
- ck->valid = false;
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free_fast(bc, ck);
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
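
The rewritten traversal reduces to a small state machine: try the hash-table fast
path; on -ENOENT fall into btree_key_cache_fill(), which returns -EEXIST if it
lost the create race to another thread; -EEXIST loops back to the fast path,
which will now find the winner's entry. The control flow in isolation (a sketch;
the stand-in functions use the same errno convention as above):

    #include <errno.h>

    int cache_lookup_fast(void *cache, unsigned long key);	/* -ENOENT if absent */
    int cache_fill(void *cache, unsigned long key);		/* -EEXIST if raced */

    static int cache_traverse(void *cache, unsigned long key)
    {
    	int ret;

    	do {
    		ret = cache_lookup_fast(cache, key);
    		if (ret == -ENOENT)
    			ret = cache_fill(cache, key);
    	} while (ret == -EEXIST);	/* raced with another fill: retry lookup */

    	return ret;
    }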
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 8ab85f21..cca336fe 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -137,7 +137,8 @@ static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
+ if (btree_node_locked_type(trans->paths + i->path, i->level) ==
+ BTREE_NODE_WRITE_LOCKED)
bch2_btree_node_unlock_write_inlined(trans,
trans->paths + i->path, insert_l(trans, i)->b);
trans->write_locked = false;
@@ -777,14 +778,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans_for_each_update(trans, i) {
struct btree_path *path = trans->paths + i->path;
- if (!i->cached) {
+ if (!i->cached)
bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
- } else if (!i->key_cache_already_flushed)
+ else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
- else {
+ else
bch2_btree_key_cache_drop(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- }
}
return 0;
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index e7a78ef6..c9c9864a 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -388,7 +388,6 @@ struct bkey_cached {
unsigned long flags;
unsigned long btree_trans_barrier_seq;
u16 u64s;
- bool valid;
struct bkey_cached_key key;
struct rhash_head hash;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 74c0fce3..25549c23 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -569,7 +569,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
if (flags & BTREE_TRIGGER_transactional) {
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket);
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
ret = PTR_ERR_OR_ZERO(a) ?:
__mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v);
if (ret)
@@ -1217,7 +1217,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->nbuckets - bucket_gens->first_bucket;
if (resize) {
- down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
percpu_down_write(&c->mark_lock);
}
@@ -1240,7 +1239,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (resize) {
percpu_up_write(&c->mark_lock);
up_write(&ca->bucket_lock);
- up_write(&c->gc_lock);
}
ret = 0;
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index fa563796..4a14741b 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -85,7 +85,7 @@ static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
return rcu_dereference_check(ca->buckets_gc,
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->fs->state_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
@@ -103,7 +103,7 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
return rcu_dereference_check(ca->bucket_gens,
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->fs->state_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 4248c251..ef1f7486 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -214,22 +214,10 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
- char *ro, *rest;
-
- /*
- * If passed a "read_only" mount option, remove it because it is
- * no longer a valid mount option, and the filesystem will be
- * set "read_only" regardless.
- */
- ro = strstr(optstr, "read_only");
- if (ro) {
- rest = ro + strlen("read_only");
- memmove(ro, rest, strlen(rest) + 1);
- }
-
ret = PTR_ERR_OR_ZERO(optstr) ?:
bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
- kfree(optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
@@ -333,7 +321,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
return ret;
ret = bch2_dev_add(c, path);
- kfree(path);
+ if (!IS_ERR(path))
+ kfree(path);
return ret;
}
@@ -579,7 +568,6 @@ static long bch2_ioctl_query_accounting(struct bch_fs *c,
ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
err:
- bch_err_fn(c, ret);
darray_exit(&accounting);
return ret;
}
@@ -861,7 +849,8 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
ret = PTR_ERR_OR_ZERO(optstr) ?:
bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
- kfree(optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
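
All three chardev fixes address the same bug: strndup_user() returns an ERR_PTR()
on failure, and while kfree(NULL) is a harmless no-op, kfree() on an ERR_PTR
dereferences a bogus address; hence the IS_ERR() guards. The corrected pattern as
a sketch (parse_opts() is a stand-in):

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static long handle_opts(const char __user *uopts, long (*parse_opts)(char *))
    {
    	char *optstr = strndup_user(uopts, 1 << 16);

    	/* PTR_ERR_OR_ZERO() yields the errno for an ERR_PTR, else 0 */
    	long ret = PTR_ERR_OR_ZERO(optstr) ?: parse_opts(optstr);

    	/* kfree(NULL) is fine; kfree(ERR_PTR(...)) is not */
    	if (!IS_ERR(optstr))
    		kfree(optstr);
    	return ret;
    }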
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 3bd3aba9..e7208bf1 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -10,6 +10,7 @@
#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
+#include <linux/ratelimit.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
#include <crypto/chacha.h>
@@ -436,7 +437,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
- "expected %0llx:%0llx got %0llx:%0llx (old type ",
+ " expected %0llx:%0llx got %0llx:%0llx (old type ",
__func__,
crc_old.csum.hi,
crc_old.csum.lo,
@@ -446,7 +447,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
prt_str(&buf, " new type ");
bch2_prt_csum_type(&buf, new_csum_type);
prt_str(&buf, ")");
- bch_err(c, "%s", buf.buf);
+ WARN_RATELIMIT(1, "%s", buf.buf);
printbuf_exit(&buf);
return -EIO;
}
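
Switching from bch_err() to WARN_RATELIMIT() trades a plain log line for a
backtrace, rate-limited (by default roughly 10 reports per 5-second window) so a
stream of checksum failures cannot flood the log. Minimal usage sketch:

    #include <linux/errno.h>
    #include <linux/ratelimit.h>
    #include <linux/types.h>

    static int verify(u64 expect, u64 got)
    {
    	/* warns with a backtrace when the condition is true, ratelimited */
    	WARN_RATELIMIT(expect != got,
    		       "checksum mismatch: expected %llx got %llx\n",
    		       expect, got);
    	return expect != got ? -EIO : 0;
    }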
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 51cbf392..f0d4727c 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
+typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
+
+static void list_sort(struct list_head *head, list_cmp_fn cmp)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, head)
+ while (!list_is_last(pos, head) &&
+ cmp(pos, pos->next) > 0) {
+ struct list_head *pos2, *next = pos->next;
+
+ list_del(next);
+ list_for_each(pos2, head)
+ if (cmp(next, pos2) < 0)
+ goto pos_found;
+ BUG();
+pos_found:
+ list_add_tail(next, pos2);
+ }
+}
+
+static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
+{
+ return cmp_int(l, r);
+}
+
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -575,41 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
struct bch_fs *c = i->c;
struct btree_trans *trans;
ssize_t ret = 0;
- u32 seq;
i->ubuf = buf;
i->size = size;
i->ret = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= i->iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ i->iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto unlocked;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
+
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
prt_printf(&i->buf, "backtrace:\n");
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = task->pid;
-
closure_put(&trans->ref);
+ ret = flush_buf(i);
+ if (ret)
+ goto unlocked;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
@@ -804,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = {
.read = btree_transaction_stats_read,
};
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+/* walk btree transactions until we find a deadlock and print it */
+static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
struct btree_trans *trans;
- ssize_t ret = 0;
- u32 seq;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (i->iter)
- goto out;
+ pid_t iter = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
struct task_struct *task = READ_ONCE(trans->locking_wait.task);
- if (!task || task->pid <= i->iter)
+ if (!task || task->pid <= iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ iter = task->pid;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto out;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
- bch2_check_for_deadlock(trans, &i->buf);
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
- i->iter = task->pid;
+ bool found = bch2_check_for_deadlock(trans, out) != 0;
closure_put(&trans->ref);
+ if (found)
+ return;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
seqmutex_unlock(&c->btree_trans_lock);
-out:
+}
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ btree_deadlock_to_text(&i->buf, c);
+ i->iter++;
+ }
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
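
The new scheme sorts btree_trans_list by address and uses the transaction pointer
itself as the resume cursor in i->iter: after the seqmutex is dropped and retaken,
iteration skips everything at or below the cursor, which is race-free in a way
the old pid-based cursor was not (the task may be gone by the time we look at
it). The cursor logic in isolation (a sketch):

    #include <stddef.h>

    struct node {
    	struct node *next;	/* list kept sorted by address */
    };

    /*
     * Resumable walk: @cursor is the address of the last node visited
     * (0 to start). Skipping nodes <= cursor makes restarts idempotent.
     */
    static struct node *walk_next(struct node *head, unsigned long *cursor)
    {
    	for (struct node *n = head; n; n = n->next) {
    		if ((unsigned long) n <= *cursor)
    			continue;	/* visited before we dropped the lock */
    		*cursor = (unsigned long) n;
    		return n;
    	}
    	return NULL;
    }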
diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c
index 37037459..dcdd5924 100644
--- a/libbcachefs/disk_accounting.c
+++ b/libbcachefs/disk_accounting.c
@@ -521,8 +521,9 @@ fsck_err:
return ret;
}
-static int accounting_read_key(struct bch_fs *c, struct btree_trans *trans, struct bkey_s_c k)
+static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
{
+ struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
if (k.k->type != KEY_TYPE_accounting)
@@ -557,15 +558,15 @@ fsck_err:
int bch2_accounting_read(struct bch_fs *c)
{
struct bch_accounting_mem *acc = &c->accounting;
+ struct btree_trans *trans = bch2_trans_get(c);
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
+ int ret = for_each_btree_key(trans, iter,
BTREE_ID_accounting, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
struct bkey u;
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
- accounting_read_key(c, trans, k);
- })));
+ accounting_read_key(trans, k);
+ }));
if (ret)
goto err;
@@ -598,7 +599,7 @@ int bch2_accounting_read(struct bch_fs *c)
continue;
}
- ret = accounting_read_key(c, NULL, k);
+ ret = accounting_read_key(trans, k);
if (ret)
goto err;
}
@@ -645,6 +646,7 @@ int bch2_accounting_read(struct bch_fs *c)
preempt_enable();
percpu_up_read(&c->mark_lock);
err:
+ bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 3c3a2a7e..86948d11 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -283,7 +283,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update(trans, bucket);
+ bch2_trans_start_alloc_update(trans, bucket, 0);
ret = PTR_ERR_OR_ZERO(a) ?:
__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
}
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 9d7cc79e..a62b6310 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -16,6 +16,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
switch (c->opts.errors) {
case BCH_ON_ERROR_continue:
return false;
+ case BCH_ON_ERROR_fix_safe:
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
@@ -211,6 +212,12 @@ static void prt_actioning(struct printbuf *out, const char *action)
prt_str(out, "ing");
}
+static const u8 fsck_flags_extra[] = {
+#define x(t, n, flags) [BCH_FSCK_ERR_##t] = flags,
+ BCH_SB_ERRS()
+#undef x
+};
+
int __bch2_fsck_err(struct bch_fs *c,
struct btree_trans *trans,
enum bch_fsck_flags flags,
@@ -226,6 +233,9 @@ int __bch2_fsck_err(struct bch_fs *c,
might_sleep();
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ flags |= fsck_flags_extra[err];
+
if (!c)
c = trans->c;
@@ -293,7 +303,14 @@ int __bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ if ((flags & FSCK_CAN_FIX) &&
+ (flags & FSCK_AUTOFIX) &&
+ (c->opts.errors == BCH_ON_ERROR_continue ||
+ c->opts.errors == BCH_ON_ERROR_fix_safe)) {
+ prt_str(out, ", ");
+ prt_actioning(out, action);
+ ret = -BCH_ERR_fsck_fix;
+ } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down");
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index ead36936..995e6bba 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -108,13 +108,6 @@ struct fsck_err_state {
char *last_msg;
};
-enum bch_fsck_flags {
- FSCK_CAN_FIX = 1 << 0,
- FSCK_CAN_IGNORE = 1 << 1,
- FSCK_NEED_FSCK = 1 << 2,
- FSCK_NO_RATELIMIT = 1 << 3,
-};
-
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
__printf(5, 6) __cold
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 410b8bd8..057df38f 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -1034,6 +1034,18 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
--out->atomic;
}
+void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
+{
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
+ crc->compressed_size,
+ crc->uncompressed_size,
+ crc->offset, crc->nonce);
+ bch2_prt_csum_type(out, crc->csum_type);
+ prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
+ prt_str(out, " compress ");
+ bch2_prt_compression_type(out, crc->compression_type);
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -1059,13 +1071,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
- crc.compressed_size,
- crc.uncompressed_size,
- crc.offset, crc.nonce);
- bch2_prt_csum_type(out, crc.csum_type);
- prt_str(out, " compress ");
- bch2_prt_compression_type(out, crc.compression_type);
+ bch2_extent_crc_unpacked_to_text(out, &crc);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1096,6 +1102,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+
static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_s_c k,
enum bch_validate_flags flags,
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 1ade9596..530686aa 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -212,6 +212,8 @@ static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
}
+void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
+
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index 54873ecc..cc33d763 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -678,8 +678,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_pagecache_add_get(inode);
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
- mapping_gfp_mask(mapping));
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping));
if (IS_ERR_OR_NULL(folio))
goto err_unlock;
@@ -820,9 +820,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_init(&fs);
ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
- FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
- mapping_gfp_mask(mapping),
- &fs);
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping), &fs);
if (ret)
goto out;
@@ -864,24 +863,26 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
f_pos = pos;
f_offset = pos - folio_pos(darray_first(fs));
darray_for_each(fs, fi) {
+ ssize_t f_reserved;
+
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
+
+ if (unlikely(f_reserved != f_len)) {
+ if (f_reserved < 0) {
+ if (f == darray_first(fs)) {
+ ret = f_reserved;
+ goto out;
+ }
+
+ folios_trunc(&fs, fi);
+ end = min(end, folio_end_pos(darray_last(fs)));
+ } else {
+ folios_trunc(&fs, fi + 1);
+ end = f_pos + f_reserved;
+ }
- /*
- * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
- * supposed to write as much as we have disk space for.
- *
- * On failure here we should still write out a partial page if
- * we aren't completely out of disk space - we don't do that
- * yet:
- */
- ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
- if (unlikely(ret)) {
- folios_trunc(&fs, fi);
- if (!fs.nr)
- goto out;
-
- end = min(end, folio_end_pos(darray_last(fs)));
break;
}
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index 049b61bc..e246b1e0 100644
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -179,7 +179,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct bch_inode_info *inode = file_bch_inode(file);
struct address_space *mapping = file->f_mapping;
size_t count = iov_iter_count(iter);
- ssize_t ret;
+ ssize_t ret = 0;
if (!count)
return 0; /* skip atime */
@@ -205,7 +205,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += ret;
} else {
bch2_pagecache_add_get(inode);
- ret = generic_file_read_iter(iocb, iter);
+ ret = filemap_read(iocb, iter, ret);
bch2_pagecache_add_put(inode);
}
out:
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
index 872283e5..a9cc5cad 100644
--- a/libbcachefs/fs-io-pagecache.c
+++ b/libbcachefs/fs-io-pagecache.c
@@ -423,7 +423,7 @@ int bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
+ size_t offset, size_t len)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
@@ -437,8 +437,7 @@ int bch2_folio_reservation_get(struct bch_fs *c,
for (i = round_down(offset, block_bytes(c)) >> 9;
i < round_up(offset + len, block_bytes(c)) >> 9;
i++) {
- disk_sectors += sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
quota_sectors += s->s[i].state == SECTOR_unallocated;
}
@@ -449,12 +448,9 @@ int bch2_folio_reservation_get(struct bch_fs *c,
}
if (quota_sectors) {
- ret = bch2_quota_reservation_add(c, inode, &res->quota,
- quota_sectors, true);
+ ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
if (unlikely(ret)) {
- struct disk_reservation tmp = {
- .sectors = disk_sectors
- };
+ struct disk_reservation tmp = { .sectors = disk_sectors };
bch2_disk_reservation_put(c, &tmp);
res->disk.sectors -= disk_sectors;
@@ -465,6 +461,31 @@ int bch2_folio_reservation_get(struct bch_fs *c,
return 0;
}
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ size_t l, reserved = 0;
+ int ret;
+
+ while ((l = len - reserved)) {
+ while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) {
+ if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c))
+ return reserved ?: ret;
+
+ len = reserved + l;
+ l /= 2;
+ }
+
+ offset += l;
+ reserved += l;
+ }
+
+ return reserved;
+}
+
static void bch2_clear_folio_bits(struct folio *folio)
{
struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
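
bch2_folio_reservation_get_partial() is what lets __bch2_buffered_write() above make partial progress on -ENOSPC (per POSIX and fstests generic/275): instead of failing the whole request it halves the requested length until a reservation succeeds or the remainder no longer crosses a block boundary, returning the number of bytes actually reserved, or the error if none could be. A simplified sketch of the strategy, where try_reserve() is a hypothetical stand-in for bch2_folio_reservation_get():

    #include <sys/types.h>

    static int try_reserve(size_t offset, size_t len);  /* hypothetical */

    /* returns bytes reserved, or a negative error if none could be */
    static ssize_t reserve_partial(size_t offset, size_t len, size_t block)
    {
            size_t l, reserved = 0;
            int ret;

            while ((l = len - reserved)) {
                    while ((ret = try_reserve(offset, l))) {
                            /* can't split below a single block: give up */
                            if ((offset & (block - 1)) + l <= block)
                                    return reserved ?: ret;

                            len = reserved + l;
                            l /= 2;
                    }

                    offset += l;
                    reserved += l;
            }

            return reserved;
    }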
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
index 828c3d7c..fd7d692c 100644
--- a/libbcachefs/fs-io-pagecache.h
+++ b/libbcachefs/fs-io-pagecache.h
@@ -153,7 +153,12 @@ int bch2_folio_reservation_get(struct bch_fs *,
struct bch_inode_info *,
struct folio *,
struct bch2_folio_reservation *,
- unsigned, unsigned);
+ size_t, size_t);
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ size_t, size_t);
void bch2_set_folio_dirty(struct bch_fs *,
struct bch_inode_info *,
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 4a3e9f42..d34d628f 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -188,6 +188,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
BUG_ON(!old);
if (unlikely(old != inode)) {
+ /*
+ * bcachefs doesn't use I_NEW; we have no use for it since we
+ * only insert fully created inodes in the inode hash table. But
+ * discard_new_inode() expects it to be set...
+ */
+ inode->v.i_flags |= I_NEW;
discard_new_inode(&inode->v);
inode = old;
} else {
@@ -195,8 +201,10 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/*
- * we really don't want insert_inode_locked2() to be setting
- * I_NEW...
+ * Again, I_NEW makes no sense for bcachefs. unlock_new_inode() is
+ * only needed here to clear I_NEW, which inode_insert5() set for us
+ * even though the inode was already fully created and initialized -
+ * we never wanted it set in the first place.
*/
unlock_new_inode(&inode->v);
}
@@ -880,6 +888,16 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->subvol = inode->ei_subvol;
stat->result_mask |= STATX_SUBVOL;
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN;
+ /*
+ * this is incorrect; we should be tracking this in superblock,
+ * and checking the alignment of open devices
+ */
+ stat->dio_mem_align = SECTOR_SIZE;
+ stat->dio_offset_align = block_bytes(c);
+ }
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
@@ -1157,6 +1175,7 @@ static const struct file_operations bch_file_operations = {
.read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
.mmap = bch2_mmap,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
@@ -1488,11 +1507,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
- if (BCH_SUBVOLUME_SNAP(subvol))
- set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
- else
- clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-
inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum;
inode->v.i_rdev = bi->bi_dev;
@@ -1504,6 +1518,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_qid = bch_qid(bi);
inode->ei_subvol = inum.subvol;
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
inode->v.i_mapping->a_ops = &bch_address_space_operations;
switch (inode->v.i_mode & S_IFMT) {
@@ -1776,7 +1793,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
- if (!(opt->flags & OPT_MOUNT))
+ if ((opt->flags & OPT_HIDDEN) ||
+ !(opt->flags & OPT_MOUNT))
continue;
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
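
With the STATX_DIOALIGN support added above, userspace can now query O_DIRECT alignment requirements on bcachefs the same way as on other filesystems. A hedged usage sketch, assuming Linux 6.1+ headers and a glibc that exposes STATX_DIOALIGN and the stx_dio_* fields:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
            struct statx stx;

            if (argc < 2) {
                    fprintf(stderr, "usage: %s <path>\n", argv[0]);
                    return 1;
            }

            if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx)) {
                    perror("statx");
                    return 1;
            }

            if (stx.stx_mask & STATX_DIOALIGN)
                    printf("dio mem align %u, dio offset align %u\n",
                           stx.stx_dio_mem_align, stx.stx_dio_offset_align);
            return 0;
    }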
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 6b807ecb..1e20020e 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -535,12 +535,13 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
+ prt_printf(out, "\n");
printbuf_indent_add(out, 2);
prt_printf(out, "mode=%o\n", inode->bi_mode);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, " (%x)\n", inode->bi_flags);
+ prt_printf(out, "(%x)\n", inode->bi_flags);
prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
prt_printf(out, "bi_size=%llu\n", inode->bi_size);
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index c97fa700..2a5c4371 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -389,7 +389,6 @@ retry:
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(trans);
if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index c6197e6a..b3b05e93 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -1080,7 +1080,10 @@ do_write:
*_dst = dst;
return more;
csum_err:
- bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)",
op->flags & BCH_WRITE_MOVE ? "move" : "user");
ret = -EIO;
err:
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index d5a9f3ad..6209d778 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1520,6 +1520,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
struct journal_entry_pin *pin;
spin_lock(&j->lock);
+ if (!test_bit(JOURNAL_running, &j->flags)) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
*seq = max(*seq, j->pin.front);
if (*seq >= j->pin.back) {
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 5fda9a93..ff832d20 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -722,13 +722,16 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
+ printbuf_indent_add(out, 2);
for (i = 0; i < nr_types; i++) {
+ prt_newline(out);
bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
+ printbuf_indent_sub(out, 2);
}
static int journal_entry_log_validate(struct bch_fs *c,
@@ -1678,6 +1681,13 @@ static CLOSURE_CALLBACK(journal_write_done)
mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+ /*
+ * We don't typically trigger journal writes from here - the next journal
+ * write will be triggered immediately after the previous one is
+ * allocated, in bch2_journal_write() - but the journal write error path
+ * is special:
+ */
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
}
@@ -1974,7 +1984,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
- struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
@@ -2018,11 +2027,16 @@ CLOSURE_CALLBACK(bch2_journal_write)
}
if (ret) {
- __bch2_journal_debug_to_text(&journal_debug_buf, j);
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"),
+ le64_to_cpu(w->data->seq),
+ bch2_err_str(ret));
+ __bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
- bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf.buf);
- printbuf_exit(&journal_debug_buf);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
goto err;
}
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index ed484670..1f25c111 100644
--- a/libbcachefs/journal_seq_blacklist.c
+++ b/libbcachefs/journal_seq_blacklist.c
@@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
BUG_ON(nr != t->nr);
unsigned i;
- for (src = bl->start, i = eytzinger0_first(t->nr);
+ for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h
index bd71ba77..425ba732 100644
--- a/libbcachefs/lru.h
+++ b/libbcachefs/lru.h
@@ -24,18 +24,6 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
return pos;
}
-#define BCH_LRU_TYPES() \
- x(read) \
- x(fragmentation)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
- BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
-
static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
diff --git a/libbcachefs/lru_format.h b/libbcachefs/lru_format.h
new file mode 100644
index 00000000..f372cb3b
--- /dev/null
+++ b/libbcachefs/lru_format.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_FORMAT_H
+#define _BCACHEFS_LRU_FORMAT_H
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __packed __aligned(8);
+
+#define BCH_LRU_TYPES() \
+ x(read) \
+ x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+ BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
+#endif /* _BCACHEFS_LRU_FORMAT_H */
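
LRU_TIME_BITS/LRU_TIME_MAX, now defined here next to the on-disk bch_lru type, describe how LRU keys are packed: the bpos inode field carries the 16-bit LRU id above a 48-bit time, which is why lru_type() in lru.h recovers the type from p.inode >> 48 and why fragmentation LRUs start at (1U << 16) - 1. A small userspace-style sketch of the packing (simplified; the real helper is lru_pos()):

    #include <stdint.h>
    #include <stdio.h>

    #define LRU_TIME_BITS   48
    #define LRU_TIME_MAX    ((1ULL << LRU_TIME_BITS) - 1)

    /* pack a 16-bit lru id and 48-bit time into one 64-bit field */
    static uint64_t lru_pos_inode(uint16_t lru_id, uint64_t time)
    {
            return ((uint64_t) lru_id << LRU_TIME_BITS) |
                    (time & LRU_TIME_MAX);
    }

    int main(void)
    {
            uint64_t inode = lru_pos_inode(0xffff, 12345);

            printf("lru_id %#llx time %llu\n",
                   (unsigned long long) (inode >> LRU_TIME_BITS),
                   (unsigned long long) (inode & LRU_TIME_MAX));
            return 0;
    }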
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index cff35845..60b93018 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -63,6 +63,7 @@ enum opt_flags {
OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
+ OPT_HIDDEN = (1 << 10),
};
enum opt_type {
@@ -137,7 +138,7 @@ enum fsck_err_opts {
x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
@@ -406,7 +407,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
- OPT_FS, \
+ OPT_FS|OPT_MOUNT|OPT_HIDDEN, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, NULL) \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 097ef7d1..d89eb43c 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -699,10 +699,10 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
if (write_sb)
bch2_write_super(c);
-
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
diff --git a/libbcachefs/recovery_passes.c b/libbcachefs/recovery_passes.c
index 4a59f52f..73339a0a 100644
--- a/libbcachefs/recovery_passes.c
+++ b/libbcachefs/recovery_passes.c
@@ -193,6 +193,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c)
{
int ret = 0;
+ down_read(&c->state_lock);
+
for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
struct recovery_pass_fn *p = recovery_pass_fns + i;
@@ -208,6 +210,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c)
break;
}
+ up_read(&c->state_lock);
+
return ret;
}
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index be81c8c6..dfbbd33c 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -77,6 +77,7 @@
BCH_FSCK_ERR_fs_usage_cached_wrong, \
BCH_FSCK_ERR_fs_usage_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
+ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
BCH_FSCK_ERR_bkey_version_in_future)
diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c
index bda33e59..c1270d79 100644
--- a/libbcachefs/sb-errors.c
+++ b/libbcachefs/sb-errors.c
@@ -110,19 +110,25 @@ out:
void bch2_sb_errors_from_cpu(struct bch_fs *c)
{
bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst =
- bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
+ struct bch_sb_field_errors *dst;
unsigned i;
+ mutex_lock(&c->fsck_error_counts_lock);
+
+ dst = bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+
if (!dst)
- return;
+ goto err;
for (i = 0; i < src->nr; i++) {
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
}
+
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
}
static int bch2_sb_errors_to_cpu(struct bch_fs *c)
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index 9dd2b7ae..67648b77 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -2,286 +2,294 @@
#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H
#define _BCACHEFS_SB_ERRORS_FORMAT_H
-#define BCH_SB_ERRS() \
- x(clean_but_journal_not_empty, 0) \
- x(dirty_but_no_journal_entries, 1) \
- x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
- x(sb_clean_journal_seq_mismatch, 3) \
- x(sb_clean_btree_root_mismatch, 4) \
- x(sb_clean_missing, 5) \
- x(jset_unsupported_version, 6) \
- x(jset_unknown_csum, 7) \
- x(jset_last_seq_newer_than_seq, 8) \
- x(jset_past_bucket_end, 9) \
- x(jset_seq_blacklisted, 10) \
- x(journal_entries_missing, 11) \
- x(journal_entry_replicas_not_marked, 12) \
- x(journal_entry_past_jset_end, 13) \
- x(journal_entry_replicas_data_mismatch, 14) \
- x(journal_entry_bkey_u64s_0, 15) \
- x(journal_entry_bkey_past_end, 16) \
- x(journal_entry_bkey_bad_format, 17) \
- x(journal_entry_bkey_invalid, 18) \
- x(journal_entry_btree_root_bad_size, 19) \
- x(journal_entry_blacklist_bad_size, 20) \
- x(journal_entry_blacklist_v2_bad_size, 21) \
- x(journal_entry_blacklist_v2_start_past_end, 22) \
- x(journal_entry_usage_bad_size, 23) \
- x(journal_entry_data_usage_bad_size, 24) \
- x(journal_entry_clock_bad_size, 25) \
- x(journal_entry_clock_bad_rw, 26) \
- x(journal_entry_dev_usage_bad_size, 27) \
- x(journal_entry_dev_usage_bad_dev, 28) \
- x(journal_entry_dev_usage_bad_pad, 29) \
- x(btree_node_unreadable, 30) \
- x(btree_node_fault_injected, 31) \
- x(btree_node_bad_magic, 32) \
- x(btree_node_bad_seq, 33) \
- x(btree_node_unsupported_version, 34) \
- x(btree_node_bset_older_than_sb_min, 35) \
- x(btree_node_bset_newer_than_sb, 36) \
- x(btree_node_data_missing, 37) \
- x(btree_node_bset_after_end, 38) \
- x(btree_node_replicas_sectors_written_mismatch, 39) \
- x(btree_node_replicas_data_mismatch, 40) \
- x(bset_unknown_csum, 41) \
- x(bset_bad_csum, 42) \
- x(bset_past_end_of_btree_node, 43) \
- x(bset_wrong_sector_offset, 44) \
- x(bset_empty, 45) \
- x(bset_bad_seq, 46) \
- x(bset_blacklisted_journal_seq, 47) \
- x(first_bset_blacklisted_journal_seq, 48) \
- x(btree_node_bad_btree, 49) \
- x(btree_node_bad_level, 50) \
- x(btree_node_bad_min_key, 51) \
- x(btree_node_bad_max_key, 52) \
- x(btree_node_bad_format, 53) \
- x(btree_node_bkey_past_bset_end, 54) \
- x(btree_node_bkey_bad_format, 55) \
- x(btree_node_bad_bkey, 56) \
- x(btree_node_bkey_out_of_order, 57) \
- x(btree_root_bkey_invalid, 58) \
- x(btree_root_read_error, 59) \
- x(btree_root_bad_min_key, 60) \
- x(btree_root_bad_max_key, 61) \
- x(btree_node_read_error, 62) \
- x(btree_node_topology_bad_min_key, 63) \
- x(btree_node_topology_bad_max_key, 64) \
- x(btree_node_topology_overwritten_by_prev_node, 65) \
- x(btree_node_topology_overwritten_by_next_node, 66) \
- x(btree_node_topology_interior_node_empty, 67) \
- x(fs_usage_hidden_wrong, 68) \
- x(fs_usage_btree_wrong, 69) \
- x(fs_usage_data_wrong, 70) \
- x(fs_usage_cached_wrong, 71) \
- x(fs_usage_reserved_wrong, 72) \
- x(fs_usage_persistent_reserved_wrong, 73) \
- x(fs_usage_nr_inodes_wrong, 74) \
- x(fs_usage_replicas_wrong, 75) \
- x(dev_usage_buckets_wrong, 76) \
- x(dev_usage_sectors_wrong, 77) \
- x(dev_usage_fragmented_wrong, 78) \
- x(dev_usage_buckets_ec_wrong, 79) \
- x(bkey_version_in_future, 80) \
- x(bkey_u64s_too_small, 81) \
- x(bkey_invalid_type_for_btree, 82) \
- x(bkey_extent_size_zero, 83) \
- x(bkey_extent_size_greater_than_offset, 84) \
- x(bkey_size_nonzero, 85) \
- x(bkey_snapshot_nonzero, 86) \
- x(bkey_snapshot_zero, 87) \
- x(bkey_at_pos_max, 88) \
- x(bkey_before_start_of_btree_node, 89) \
- x(bkey_after_end_of_btree_node, 90) \
- x(bkey_val_size_nonzero, 91) \
- x(bkey_val_size_too_small, 92) \
- x(alloc_v1_val_size_bad, 93) \
- x(alloc_v2_unpack_error, 94) \
- x(alloc_v3_unpack_error, 95) \
- x(alloc_v4_val_size_bad, 96) \
- x(alloc_v4_backpointers_start_bad, 97) \
- x(alloc_key_data_type_bad, 98) \
- x(alloc_key_empty_but_have_data, 99) \
- x(alloc_key_dirty_sectors_0, 100) \
- x(alloc_key_data_type_inconsistency, 101) \
- x(alloc_key_to_missing_dev_bucket, 102) \
- x(alloc_key_cached_inconsistency, 103) \
- x(alloc_key_cached_but_read_time_zero, 104) \
- x(alloc_key_to_missing_lru_entry, 105) \
- x(alloc_key_data_type_wrong, 106) \
- x(alloc_key_gen_wrong, 107) \
- x(alloc_key_dirty_sectors_wrong, 108) \
- x(alloc_key_cached_sectors_wrong, 109) \
- x(alloc_key_stripe_wrong, 110) \
- x(alloc_key_stripe_redundancy_wrong, 111) \
- x(bucket_sector_count_overflow, 112) \
- x(bucket_metadata_type_mismatch, 113) \
- x(need_discard_key_wrong, 114) \
- x(freespace_key_wrong, 115) \
- x(freespace_hole_missing, 116) \
- x(bucket_gens_val_size_bad, 117) \
- x(bucket_gens_key_wrong, 118) \
- x(bucket_gens_hole_wrong, 119) \
- x(bucket_gens_to_invalid_dev, 120) \
- x(bucket_gens_to_invalid_buckets, 121) \
- x(bucket_gens_nonzero_for_invalid_buckets, 122) \
- x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
- x(need_discard_freespace_key_bad, 124) \
- x(backpointer_bucket_offset_wrong, 125) \
- x(backpointer_to_missing_device, 126) \
- x(backpointer_to_missing_alloc, 127) \
- x(backpointer_to_missing_ptr, 128) \
- x(lru_entry_at_time_0, 129) \
- x(lru_entry_to_invalid_bucket, 130) \
- x(lru_entry_bad, 131) \
- x(btree_ptr_val_too_big, 132) \
- x(btree_ptr_v2_val_too_big, 133) \
- x(btree_ptr_has_non_ptr, 134) \
- x(extent_ptrs_invalid_entry, 135) \
- x(extent_ptrs_no_ptrs, 136) \
- x(extent_ptrs_too_many_ptrs, 137) \
- x(extent_ptrs_redundant_crc, 138) \
- x(extent_ptrs_redundant_stripe, 139) \
- x(extent_ptrs_unwritten, 140) \
- x(extent_ptrs_written_and_unwritten, 141) \
- x(ptr_to_invalid_device, 142) \
- x(ptr_to_duplicate_device, 143) \
- x(ptr_after_last_bucket, 144) \
- x(ptr_before_first_bucket, 145) \
- x(ptr_spans_multiple_buckets, 146) \
- x(ptr_to_missing_backpointer, 147) \
- x(ptr_to_missing_alloc_key, 148) \
- x(ptr_to_missing_replicas_entry, 149) \
- x(ptr_to_missing_stripe, 150) \
- x(ptr_to_incorrect_stripe, 151) \
- x(ptr_gen_newer_than_bucket_gen, 152) \
- x(ptr_too_stale, 153) \
- x(stale_dirty_ptr, 154) \
- x(ptr_bucket_data_type_mismatch, 155) \
- x(ptr_cached_and_erasure_coded, 156) \
- x(ptr_crc_uncompressed_size_too_small, 157) \
- x(ptr_crc_csum_type_unknown, 158) \
- x(ptr_crc_compression_type_unknown, 159) \
- x(ptr_crc_redundant, 160) \
- x(ptr_crc_uncompressed_size_too_big, 161) \
- x(ptr_crc_nonce_mismatch, 162) \
- x(ptr_stripe_redundant, 163) \
- x(reservation_key_nr_replicas_invalid, 164) \
- x(reflink_v_refcount_wrong, 165) \
- x(reflink_p_to_missing_reflink_v, 166) \
- x(stripe_pos_bad, 167) \
- x(stripe_val_size_bad, 168) \
- x(stripe_sector_count_wrong, 169) \
- x(snapshot_tree_pos_bad, 170) \
- x(snapshot_tree_to_missing_snapshot, 171) \
- x(snapshot_tree_to_missing_subvol, 172) \
- x(snapshot_tree_to_wrong_subvol, 173) \
- x(snapshot_tree_to_snapshot_subvol, 174) \
- x(snapshot_pos_bad, 175) \
- x(snapshot_parent_bad, 176) \
- x(snapshot_children_not_normalized, 177) \
- x(snapshot_child_duplicate, 178) \
- x(snapshot_child_bad, 179) \
- x(snapshot_skiplist_not_normalized, 180) \
- x(snapshot_skiplist_bad, 181) \
- x(snapshot_should_not_have_subvol, 182) \
- x(snapshot_to_bad_snapshot_tree, 183) \
- x(snapshot_bad_depth, 184) \
- x(snapshot_bad_skiplist, 185) \
- x(subvol_pos_bad, 186) \
- x(subvol_not_master_and_not_snapshot, 187) \
- x(subvol_to_missing_root, 188) \
- x(subvol_root_wrong_bi_subvol, 189) \
- x(bkey_in_missing_snapshot, 190) \
- x(inode_pos_inode_nonzero, 191) \
- x(inode_pos_blockdev_range, 192) \
- x(inode_unpack_error, 193) \
- x(inode_str_hash_invalid, 194) \
- x(inode_v3_fields_start_bad, 195) \
- x(inode_snapshot_mismatch, 196) \
- x(inode_unlinked_but_clean, 197) \
- x(inode_unlinked_but_nlink_nonzero, 198) \
- x(inode_checksum_type_invalid, 199) \
- x(inode_compression_type_invalid, 200) \
- x(inode_subvol_root_but_not_dir, 201) \
- x(inode_i_size_dirty_but_clean, 202) \
- x(inode_i_sectors_dirty_but_clean, 203) \
- x(inode_i_sectors_wrong, 204) \
- x(inode_dir_wrong_nlink, 205) \
- x(inode_dir_multiple_links, 206) \
- x(inode_multiple_links_but_nlink_0, 207) \
- x(inode_wrong_backpointer, 208) \
- x(inode_wrong_nlink, 209) \
- x(inode_unreachable, 210) \
- x(deleted_inode_but_clean, 211) \
- x(deleted_inode_missing, 212) \
- x(deleted_inode_is_dir, 213) \
- x(deleted_inode_not_unlinked, 214) \
- x(extent_overlapping, 215) \
- x(key_in_missing_inode, 216) \
- x(key_in_wrong_inode_type, 217) \
- x(extent_past_end_of_inode, 218) \
- x(dirent_empty_name, 219) \
- x(dirent_val_too_big, 220) \
- x(dirent_name_too_long, 221) \
- x(dirent_name_embedded_nul, 222) \
- x(dirent_name_dot_or_dotdot, 223) \
- x(dirent_name_has_slash, 224) \
- x(dirent_d_type_wrong, 225) \
- x(inode_bi_parent_wrong, 226) \
- x(dirent_in_missing_dir_inode, 227) \
- x(dirent_in_non_dir_inode, 228) \
- x(dirent_to_missing_inode, 229) \
- x(dirent_to_missing_subvol, 230) \
- x(dirent_to_itself, 231) \
- x(quota_type_invalid, 232) \
- x(xattr_val_size_too_small, 233) \
- x(xattr_val_size_too_big, 234) \
- x(xattr_invalid_type, 235) \
- x(xattr_name_invalid_chars, 236) \
- x(xattr_in_missing_inode, 237) \
- x(root_subvol_missing, 238) \
- x(root_dir_missing, 239) \
- x(root_inode_not_dir, 240) \
- x(dir_loop, 241) \
- x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243) \
- x(unlinked_inode_not_on_deleted_list, 244) \
- x(reflink_p_front_pad_bad, 245) \
- x(journal_entry_dup_same_device, 246) \
- x(inode_bi_subvol_missing, 247) \
- x(inode_bi_subvol_wrong, 248) \
- x(inode_points_to_missing_dirent, 249) \
- x(inode_points_to_wrong_dirent, 250) \
- x(inode_bi_parent_nonzero, 251) \
- x(dirent_to_missing_parent_subvol, 252) \
- x(dirent_not_visible_in_parent_subvol, 253) \
- x(subvol_fs_path_parent_wrong, 254) \
- x(subvol_root_fs_path_parent_nonzero, 255) \
- x(subvol_children_not_set, 256) \
- x(subvol_children_bad, 257) \
- x(subvol_loop, 258) \
- x(subvol_unreachable, 259) \
- x(btree_node_bkey_bad_u64s, 260) \
- x(btree_node_topology_empty_interior_node, 261) \
- x(btree_ptr_v2_min_key_bad, 262) \
- x(btree_root_unreadable_and_scan_found_nothing, 263) \
- x(snapshot_node_missing, 264) \
- x(dup_backpointer_to_bad_csum_extent, 265) \
- x(btree_bitmap_not_marked, 266) \
- x(sb_clean_entry_overrun, 267) \
- x(btree_ptr_v2_written_0, 268) \
- x(subvol_snapshot_bad, 269) \
- x(subvol_inode_bad, 270) \
- x(alloc_key_stripe_sectors_wrong, 271) \
- x(accounting_mismatch, 272) \
- x(accounting_replicas_not_marked, 273) \
- x(invalid_btree_id, 274) \
- x(alloc_key_io_time_bad, 275)
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NEED_FSCK = 1 << 2,
+ FSCK_NO_RATELIMIT = 1 << 3,
+ FSCK_AUTOFIX = 1 << 4,
+};
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0, 0) \
+ x(dirty_but_no_journal_entries, 1, 0) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2, 0) \
+ x(sb_clean_journal_seq_mismatch, 3, 0) \
+ x(sb_clean_btree_root_mismatch, 4, 0) \
+ x(sb_clean_missing, 5, 0) \
+ x(jset_unsupported_version, 6, 0) \
+ x(jset_unknown_csum, 7, 0) \
+ x(jset_last_seq_newer_than_seq, 8, 0) \
+ x(jset_past_bucket_end, 9, 0) \
+ x(jset_seq_blacklisted, 10, 0) \
+ x(journal_entries_missing, 11, 0) \
+ x(journal_entry_replicas_not_marked, 12, 0) \
+ x(journal_entry_past_jset_end, 13, 0) \
+ x(journal_entry_replicas_data_mismatch, 14, 0) \
+ x(journal_entry_bkey_u64s_0, 15, 0) \
+ x(journal_entry_bkey_past_end, 16, 0) \
+ x(journal_entry_bkey_bad_format, 17, 0) \
+ x(journal_entry_bkey_invalid, 18, 0) \
+ x(journal_entry_btree_root_bad_size, 19, 0) \
+ x(journal_entry_blacklist_bad_size, 20, 0) \
+ x(journal_entry_blacklist_v2_bad_size, 21, 0) \
+ x(journal_entry_blacklist_v2_start_past_end, 22, 0) \
+ x(journal_entry_usage_bad_size, 23, 0) \
+ x(journal_entry_data_usage_bad_size, 24, 0) \
+ x(journal_entry_clock_bad_size, 25, 0) \
+ x(journal_entry_clock_bad_rw, 26, 0) \
+ x(journal_entry_dev_usage_bad_size, 27, 0) \
+ x(journal_entry_dev_usage_bad_dev, 28, 0) \
+ x(journal_entry_dev_usage_bad_pad, 29, 0) \
+ x(btree_node_unreadable, 30, 0) \
+ x(btree_node_fault_injected, 31, 0) \
+ x(btree_node_bad_magic, 32, 0) \
+ x(btree_node_bad_seq, 33, 0) \
+ x(btree_node_unsupported_version, 34, 0) \
+ x(btree_node_bset_older_than_sb_min, 35, 0) \
+ x(btree_node_bset_newer_than_sb, 36, 0) \
+ x(btree_node_data_missing, 37, 0) \
+ x(btree_node_bset_after_end, 38, 0) \
+ x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
+ x(btree_node_replicas_data_mismatch, 40, 0) \
+ x(bset_unknown_csum, 41, 0) \
+ x(bset_bad_csum, 42, 0) \
+ x(bset_past_end_of_btree_node, 43, 0) \
+ x(bset_wrong_sector_offset, 44, 0) \
+ x(bset_empty, 45, 0) \
+ x(bset_bad_seq, 46, 0) \
+ x(bset_blacklisted_journal_seq, 47, 0) \
+ x(first_bset_blacklisted_journal_seq, 48, 0) \
+ x(btree_node_bad_btree, 49, 0) \
+ x(btree_node_bad_level, 50, 0) \
+ x(btree_node_bad_min_key, 51, 0) \
+ x(btree_node_bad_max_key, 52, 0) \
+ x(btree_node_bad_format, 53, 0) \
+ x(btree_node_bkey_past_bset_end, 54, 0) \
+ x(btree_node_bkey_bad_format, 55, 0) \
+ x(btree_node_bad_bkey, 56, 0) \
+ x(btree_node_bkey_out_of_order, 57, 0) \
+ x(btree_root_bkey_invalid, 58, 0) \
+ x(btree_root_read_error, 59, 0) \
+ x(btree_root_bad_min_key, 60, 0) \
+ x(btree_root_bad_max_key, 61, 0) \
+ x(btree_node_read_error, 62, 0) \
+ x(btree_node_topology_bad_min_key, 63, 0) \
+ x(btree_node_topology_bad_max_key, 64, 0) \
+ x(btree_node_topology_overwritten_by_prev_node, 65, 0) \
+ x(btree_node_topology_overwritten_by_next_node, 66, 0) \
+ x(btree_node_topology_interior_node_empty, 67, 0) \
+ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \
+ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \
+ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \
+ x(fs_usage_cached_wrong, 71, FSCK_AUTOFIX) \
+ x(fs_usage_reserved_wrong, 72, FSCK_AUTOFIX) \
+ x(fs_usage_persistent_reserved_wrong, 73, FSCK_AUTOFIX) \
+ x(fs_usage_nr_inodes_wrong, 74, FSCK_AUTOFIX) \
+ x(fs_usage_replicas_wrong, 75, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_wrong, 76, FSCK_AUTOFIX) \
+ x(dev_usage_sectors_wrong, 77, FSCK_AUTOFIX) \
+ x(dev_usage_fragmented_wrong, 78, FSCK_AUTOFIX) \
+ x(dev_usage_buckets_ec_wrong, 79, FSCK_AUTOFIX) \
+ x(bkey_version_in_future, 80, 0) \
+ x(bkey_u64s_too_small, 81, 0) \
+ x(bkey_invalid_type_for_btree, 82, 0) \
+ x(bkey_extent_size_zero, 83, 0) \
+ x(bkey_extent_size_greater_than_offset, 84, 0) \
+ x(bkey_size_nonzero, 85, 0) \
+ x(bkey_snapshot_nonzero, 86, 0) \
+ x(bkey_snapshot_zero, 87, 0) \
+ x(bkey_at_pos_max, 88, 0) \
+ x(bkey_before_start_of_btree_node, 89, 0) \
+ x(bkey_after_end_of_btree_node, 90, 0) \
+ x(bkey_val_size_nonzero, 91, 0) \
+ x(bkey_val_size_too_small, 92, 0) \
+ x(alloc_v1_val_size_bad, 93, 0) \
+ x(alloc_v2_unpack_error, 94, 0) \
+ x(alloc_v3_unpack_error, 95, 0) \
+ x(alloc_v4_val_size_bad, 96, 0) \
+ x(alloc_v4_backpointers_start_bad, 97, 0) \
+ x(alloc_key_data_type_bad, 98, 0) \
+ x(alloc_key_empty_but_have_data, 99, 0) \
+ x(alloc_key_dirty_sectors_0, 100, 0) \
+ x(alloc_key_data_type_inconsistency, 101, 0) \
+ x(alloc_key_to_missing_dev_bucket, 102, 0) \
+ x(alloc_key_cached_inconsistency, 103, 0) \
+ x(alloc_key_cached_but_read_time_zero, 104, 0) \
+ x(alloc_key_to_missing_lru_entry, 105, 0) \
+ x(alloc_key_data_type_wrong, 106, FSCK_AUTOFIX) \
+ x(alloc_key_gen_wrong, 107, FSCK_AUTOFIX) \
+ x(alloc_key_dirty_sectors_wrong, 108, FSCK_AUTOFIX) \
+ x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \
+ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \
+ x(bucket_sector_count_overflow, 112, 0) \
+ x(bucket_metadata_type_mismatch, 113, 0) \
+ x(need_discard_key_wrong, 114, 0) \
+ x(freespace_key_wrong, 115, 0) \
+ x(freespace_hole_missing, 116, 0) \
+ x(bucket_gens_val_size_bad, 117, 0) \
+ x(bucket_gens_key_wrong, 118, 0) \
+ x(bucket_gens_hole_wrong, 119, 0) \
+ x(bucket_gens_to_invalid_dev, 120, 0) \
+ x(bucket_gens_to_invalid_buckets, 121, 0) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122, 0) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
+ x(need_discard_freespace_key_bad, 124, 0) \
+ x(backpointer_bucket_offset_wrong, 125, 0) \
+ x(backpointer_to_missing_device, 126, 0) \
+ x(backpointer_to_missing_alloc, 127, 0) \
+ x(backpointer_to_missing_ptr, 128, 0) \
+ x(lru_entry_at_time_0, 129, 0) \
+ x(lru_entry_to_invalid_bucket, 130, 0) \
+ x(lru_entry_bad, 131, 0) \
+ x(btree_ptr_val_too_big, 132, 0) \
+ x(btree_ptr_v2_val_too_big, 133, 0) \
+ x(btree_ptr_has_non_ptr, 134, 0) \
+ x(extent_ptrs_invalid_entry, 135, 0) \
+ x(extent_ptrs_no_ptrs, 136, 0) \
+ x(extent_ptrs_too_many_ptrs, 137, 0) \
+ x(extent_ptrs_redundant_crc, 138, 0) \
+ x(extent_ptrs_redundant_stripe, 139, 0) \
+ x(extent_ptrs_unwritten, 140, 0) \
+ x(extent_ptrs_written_and_unwritten, 141, 0) \
+ x(ptr_to_invalid_device, 142, 0) \
+ x(ptr_to_duplicate_device, 143, 0) \
+ x(ptr_after_last_bucket, 144, 0) \
+ x(ptr_before_first_bucket, 145, 0) \
+ x(ptr_spans_multiple_buckets, 146, 0) \
+ x(ptr_to_missing_backpointer, 147, 0) \
+ x(ptr_to_missing_alloc_key, 148, 0) \
+ x(ptr_to_missing_replicas_entry, 149, 0) \
+ x(ptr_to_missing_stripe, 150, 0) \
+ x(ptr_to_incorrect_stripe, 151, 0) \
+ x(ptr_gen_newer_than_bucket_gen, 152, 0) \
+ x(ptr_too_stale, 153, 0) \
+ x(stale_dirty_ptr, 154, 0) \
+ x(ptr_bucket_data_type_mismatch, 155, 0) \
+ x(ptr_cached_and_erasure_coded, 156, 0) \
+ x(ptr_crc_uncompressed_size_too_small, 157, 0) \
+ x(ptr_crc_csum_type_unknown, 158, 0) \
+ x(ptr_crc_compression_type_unknown, 159, 0) \
+ x(ptr_crc_redundant, 160, 0) \
+ x(ptr_crc_uncompressed_size_too_big, 161, 0) \
+ x(ptr_crc_nonce_mismatch, 162, 0) \
+ x(ptr_stripe_redundant, 163, 0) \
+ x(reservation_key_nr_replicas_invalid, 164, 0) \
+ x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(stripe_pos_bad, 167, 0) \
+ x(stripe_val_size_bad, 168, 0) \
+ x(stripe_sector_count_wrong, 169, 0) \
+ x(snapshot_tree_pos_bad, 170, 0) \
+ x(snapshot_tree_to_missing_snapshot, 171, 0) \
+ x(snapshot_tree_to_missing_subvol, 172, 0) \
+ x(snapshot_tree_to_wrong_subvol, 173, 0) \
+ x(snapshot_tree_to_snapshot_subvol, 174, 0) \
+ x(snapshot_pos_bad, 175, 0) \
+ x(snapshot_parent_bad, 176, 0) \
+ x(snapshot_children_not_normalized, 177, 0) \
+ x(snapshot_child_duplicate, 178, 0) \
+ x(snapshot_child_bad, 179, 0) \
+ x(snapshot_skiplist_not_normalized, 180, 0) \
+ x(snapshot_skiplist_bad, 181, 0) \
+ x(snapshot_should_not_have_subvol, 182, 0) \
+ x(snapshot_to_bad_snapshot_tree, 183, 0) \
+ x(snapshot_bad_depth, 184, 0) \
+ x(snapshot_bad_skiplist, 185, 0) \
+ x(subvol_pos_bad, 186, 0) \
+ x(subvol_not_master_and_not_snapshot, 187, 0) \
+ x(subvol_to_missing_root, 188, 0) \
+ x(subvol_root_wrong_bi_subvol, 189, 0) \
+ x(bkey_in_missing_snapshot, 190, 0) \
+ x(inode_pos_inode_nonzero, 191, 0) \
+ x(inode_pos_blockdev_range, 192, 0) \
+ x(inode_unpack_error, 193, 0) \
+ x(inode_str_hash_invalid, 194, 0) \
+ x(inode_v3_fields_start_bad, 195, 0) \
+ x(inode_snapshot_mismatch, 196, 0) \
+ x(inode_unlinked_but_clean, 197, 0) \
+ x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_checksum_type_invalid, 199, 0) \
+ x(inode_compression_type_invalid, 200, 0) \
+ x(inode_subvol_root_but_not_dir, 201, 0) \
+ x(inode_i_size_dirty_but_clean, 202, 0) \
+ x(inode_i_sectors_dirty_but_clean, 203, 0) \
+ x(inode_i_sectors_wrong, 204, 0) \
+ x(inode_dir_wrong_nlink, 205, 0) \
+ x(inode_dir_multiple_links, 206, 0) \
+ x(inode_multiple_links_but_nlink_0, 207, 0) \
+ x(inode_wrong_backpointer, 208, 0) \
+ x(inode_wrong_nlink, 209, 0) \
+ x(inode_unreachable, 210, 0) \
+ x(deleted_inode_but_clean, 211, 0) \
+ x(deleted_inode_missing, 212, 0) \
+ x(deleted_inode_is_dir, 213, 0) \
+ x(deleted_inode_not_unlinked, 214, 0) \
+ x(extent_overlapping, 215, 0) \
+ x(key_in_missing_inode, 216, 0) \
+ x(key_in_wrong_inode_type, 217, 0) \
+ x(extent_past_end_of_inode, 218, 0) \
+ x(dirent_empty_name, 219, 0) \
+ x(dirent_val_too_big, 220, 0) \
+ x(dirent_name_too_long, 221, 0) \
+ x(dirent_name_embedded_nul, 222, 0) \
+ x(dirent_name_dot_or_dotdot, 223, 0) \
+ x(dirent_name_has_slash, 224, 0) \
+ x(dirent_d_type_wrong, 225, 0) \
+ x(inode_bi_parent_wrong, 226, 0) \
+ x(dirent_in_missing_dir_inode, 227, 0) \
+ x(dirent_in_non_dir_inode, 228, 0) \
+ x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_missing_subvol, 230, 0) \
+ x(dirent_to_itself, 231, 0) \
+ x(quota_type_invalid, 232, 0) \
+ x(xattr_val_size_too_small, 233, 0) \
+ x(xattr_val_size_too_big, 234, 0) \
+ x(xattr_invalid_type, 235, 0) \
+ x(xattr_name_invalid_chars, 236, 0) \
+ x(xattr_in_missing_inode, 237, 0) \
+ x(root_subvol_missing, 238, 0) \
+ x(root_dir_missing, 239, 0) \
+ x(root_inode_not_dir, 240, 0) \
+ x(dir_loop, 241, 0) \
+ x(hash_table_key_duplicate, 242, 0) \
+ x(hash_table_key_wrong_offset, 243, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, 0) \
+ x(reflink_p_front_pad_bad, 245, 0) \
+ x(journal_entry_dup_same_device, 246, 0) \
+ x(inode_bi_subvol_missing, 247, 0) \
+ x(inode_bi_subvol_wrong, 248, 0) \
+ x(inode_points_to_missing_dirent, 249, 0) \
+ x(inode_points_to_wrong_dirent, 250, 0) \
+ x(inode_bi_parent_nonzero, 251, 0) \
+ x(dirent_to_missing_parent_subvol, 252, 0) \
+ x(dirent_not_visible_in_parent_subvol, 253, 0) \
+ x(subvol_fs_path_parent_wrong, 254, 0) \
+ x(subvol_root_fs_path_parent_nonzero, 255, 0) \
+ x(subvol_children_not_set, 256, 0) \
+ x(subvol_children_bad, 257, 0) \
+ x(subvol_loop, 258, 0) \
+ x(subvol_unreachable, 259, 0) \
+ x(btree_node_bkey_bad_u64s, 260, 0) \
+ x(btree_node_topology_empty_interior_node, 261, 0) \
+ x(btree_ptr_v2_min_key_bad, 262, 0) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
+ x(snapshot_node_missing, 264, 0) \
+ x(dup_backpointer_to_bad_csum_extent, 265, 0) \
+ x(btree_bitmap_not_marked, 266, 0) \
+ x(sb_clean_entry_overrun, 267, 0) \
+ x(btree_ptr_v2_written_0, 268, 0) \
+ x(subvol_snapshot_bad, 269, 0) \
+ x(subvol_inode_bad, 270, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, 0) \
+ x(accounting_mismatch, 272, 0) \
+ x(accounting_replicas_not_marked, 273, 0) \
+ x(invalid_btree_id, 274, 0) \
+ x(alloc_key_io_time_bad, 275, 0)
enum bch_sb_error_id {
-#define x(t, n) BCH_FSCK_ERR_##t = n,
+#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS()
#undef x
BCH_SB_ERR_MAX
diff --git a/libbcachefs/seqmutex.h b/libbcachefs/seqmutex.h
index c1860d81..c4b3d8d3 100644
--- a/libbcachefs/seqmutex.h
+++ b/libbcachefs/seqmutex.h
@@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock)
static inline void seqmutex_lock(struct seqmutex *lock)
{
mutex_lock(&lock->lock);
-}
-
-static inline void seqmutex_unlock(struct seqmutex *lock)
-{
lock->seq++;
- mutex_unlock(&lock->lock);
}
-static inline u32 seqmutex_seq(struct seqmutex *lock)
+static inline u32 seqmutex_unlock(struct seqmutex *lock)
{
- return lock->seq;
+ u32 seq = lock->seq;
+ mutex_unlock(&lock->lock);
+ return seq;
}
static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
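
Moving the seq increment into seqmutex_lock() and returning the sequence from seqmutex_unlock() tightens the protocol: the caller snapshots the sequence at the moment it drops the lock, and seqmutex_relock() later tells it whether anyone took the lock in between, in which case any saved iteration position is stale. A hedged sketch of the intended usage pattern (walk_one_element() is hypothetical):

    static void walk_list_safely(struct seqmutex *lock)
    {
            u32 seq;
    restart:
            seqmutex_lock(lock);
            walk_one_element();             /* find/copy out next item */
            seq = seqmutex_unlock(lock);

            /* ... work on the copied-out item without the lock ... */

            if (!seqmutex_relock(lock, seq))
                    goto restart;           /* lock was retaken: rescan */
            seqmutex_unlock(lock);
    }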
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
index fa7ad586..96744b1a 100644
--- a/libbcachefs/snapshot.c
+++ b/libbcachefs/snapshot.c
@@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
+ if (unlikely(new_bytes > INT_MAX))
+ return NULL;
+
new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
@@ -1682,6 +1685,8 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
+
bch2_delete_dead_snapshots(c);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index bfdec48e..ced63397 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -535,7 +535,6 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
- bch2_fs_allocator_background_exit(c);
bch2_fs_accounting_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
@@ -564,8 +563,11 @@ static void __bch2_fs_free(struct bch_fs *c)
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
- EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved));
- free_percpu(c->online_reserved);
+ if (c->online_reserved) {
+ u64 v = percpu_u64_get(c->online_reserved);
+ WARN(v, "online_reserved not 0 at shutdown: %lli", v);
+ free_percpu(c->online_reserved);
+ }
darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
@@ -1193,6 +1195,7 @@ static void bch2_dev_free(struct bch_dev *ca)
kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
+ bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
@@ -1315,6 +1318,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
atomic_long_set(&ca->ref, 1);
#endif
+ bch2_dev_allocator_background_init(ca);
+
if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
@@ -1527,6 +1532,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
* The allocator thread itself allocates btree nodes, so stop it first:
*/
bch2_dev_allocator_remove(c, ca);
+ bch2_recalc_capacity(c);
bch2_dev_journal_stop(&c->journal, ca);
}
@@ -1538,6 +1544,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ bch2_dev_do_discards(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index f4dd09c4..76ffe08e 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -697,14 +697,19 @@ do { \
} \
} while (0)
+#define per_cpu_sum(_p) \
+({ \
+ typeof(*_p) _ret = 0; \
+ \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ _ret += *per_cpu_ptr(_p, cpu); \
+ _ret; \
+})
+
static inline u64 percpu_u64_get(u64 __percpu *src)
{
- u64 ret = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- ret += *per_cpu_ptr(src, cpu);
- return ret;
+ return per_cpu_sum(src);
}
static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
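
per_cpu_sum() hoists the old percpu_u64_get() loop into a type-generic statement-expression macro, so the same summation works for any percpu scalar; percpu_u64_get() becomes a thin wrapper around it. A sketch of using it directly (kernel context assumed; the demo function is hypothetical):

    #include <linux/percpu.h>
    #include <linux/printk.h>

    static void demo_per_cpu_sum(void)
    {
            unsigned __percpu *counters = alloc_percpu(unsigned);

            if (!counters)
                    return;

            this_cpu_inc(*counters);        /* bump this CPU's slot */
            pr_info("total: %u\n", per_cpu_sum(counters));

            free_percpu(counters);
    }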
diff --git a/linux/closure.c b/linux/closure.c
index 07409e9e..c971216d 100644
--- a/linux/closure.c
+++ b/linux/closure.c
@@ -13,14 +13,25 @@
#include <linux/seq_file.h>
#include <linux/sched/debug.h>
-static inline void closure_put_after_sub(struct closure *cl, int flags)
+static inline void closure_put_after_sub_checks(int flags)
{
int r = flags & CLOSURE_REMAINING_MASK;
- BUG_ON(flags & CLOSURE_GUARD_MASK);
- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
+ if (WARN(flags & CLOSURE_GUARD_MASK,
+ "closure has guard bits set: %x (%u)",
+ flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
+ r &= ~CLOSURE_GUARD_MASK;
+
+ WARN(!r && (flags & ~CLOSURE_DESTRUCTOR),
+ "closure ref hit 0 with incorrect flags set: %x (%u)",
+ flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
+}
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+ closure_put_after_sub_checks(flags);
- if (!r) {
+ if (!(flags & CLOSURE_REMAINING_MASK)) {
smp_acquire__after_ctrl_dep();
cl->closure_get_happened = false;
@@ -139,6 +150,41 @@ void __sched __closure_sync(struct closure *cl)
}
EXPORT_SYMBOL(__closure_sync);
+/*
+ * closure_return_sync - finish running a closure, synchronously (i.e. waiting
+ * for outstanding get()s to finish) and returning once closure refcount is 0.
+ *
+ * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent
+ * closure_get_not_zero() calls will fail.
+ */
+void __sched closure_return_sync(struct closure *cl)
+{
+ struct closure_syncer s = { .task = current };
+
+ cl->s = &s;
+ set_closure_fn(cl, closure_sync_fn, NULL);
+
+ unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR,
+ &cl->remaining);
+
+ closure_put_after_sub_checks(flags);
+
+ if (unlikely(flags & CLOSURE_REMAINING_MASK)) {
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (s.done)
+ break;
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if (cl->parent)
+ closure_put(cl->parent);
+}
+EXPORT_SYMBOL(closure_return_sync);
+
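/*
 * A hedged usage sketch for closure_return_sync(); my_obj and
 * start_async_work() are hypothetical. Unlike closure_sync(), the
 * closure stays dead once this returns - no new refs can be taken.
 */
static void my_obj_free(struct my_obj *obj)
{
	start_async_work(obj);		/* gets/puts refs on obj->cl */
	closure_return_sync(&obj->cl);	/* sleep until refcount hits 0 */
	kfree(obj);
}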
int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout)
{
struct closure_syncer s = { .task = current };