Diffstat:
-rw-r--r--  .bcachefs_revision                   |   2
-rw-r--r--  Makefile                             |   2
-rw-r--r--  include/linux/atomic.h               |  12
-rw-r--r--  include/linux/kernel.h               |   2
-rw-r--r--  include/linux/mean_and_variance.h    |   2
-rw-r--r--  include/linux/wait.h                 |   1
-rw-r--r--  include/trace/events/bcachefs.h      |  27
-rw-r--r--  libbcachefs/alloc_background.c       |  16
-rw-r--r--  libbcachefs/alloc_foreground.c       | 167
-rw-r--r--  libbcachefs/alloc_foreground.h       |  25
-rw-r--r--  libbcachefs/alloc_types.h            |  13
-rw-r--r--  libbcachefs/backpointers.c           |  23
-rw-r--r--  libbcachefs/backpointers.h           |  26
-rw-r--r--  libbcachefs/bcachefs.h               |  33
-rw-r--r--  libbcachefs/bkey_sort.c              |   2
-rw-r--r--  libbcachefs/btree_cache.c            |  11
-rw-r--r--  libbcachefs/btree_io.c               |  78
-rw-r--r--  libbcachefs/btree_io.h               |  10
-rw-r--r--  libbcachefs/btree_iter.c             |  45
-rw-r--r--  libbcachefs/btree_iter.h             |   5
-rw-r--r--  libbcachefs/btree_locking.c          |   3
-rw-r--r--  libbcachefs/btree_types.h            |   1
-rw-r--r--  libbcachefs/btree_update_interior.c  |  13
-rw-r--r--  libbcachefs/btree_update_interior.h  |   1
-rw-r--r--  libbcachefs/btree_update_leaf.c      |  81
-rw-r--r--  libbcachefs/buckets.c                |  19
-rw-r--r--  libbcachefs/buckets.h                |  34
-rw-r--r--  libbcachefs/checksum.c               |   2
-rw-r--r--  libbcachefs/checksum.h               |  12
-rw-r--r--  libbcachefs/data_update.c            |  12
-rw-r--r--  libbcachefs/data_update.h            |   5
-rw-r--r--  libbcachefs/error.c                  |   2
-rw-r--r--  libbcachefs/fs-io.c                  | 367
-rw-r--r--  libbcachefs/fs.c                     |  54
-rw-r--r--  libbcachefs/fs.h                     |  35
-rw-r--r--  libbcachefs/io.c                     | 314
-rw-r--r--  libbcachefs/io.h                     |  36
-rw-r--r--  libbcachefs/io_types.h               |  14
-rw-r--r--  libbcachefs/keylist.h                |   1
-rw-r--r--  libbcachefs/move.c                   |  31
-rw-r--r--  libbcachefs/move.h                   |   3
-rw-r--r--  libbcachefs/movinggc.c               |   2
-rw-r--r--  libbcachefs/rebalance.c              |   2
-rw-r--r--  libbcachefs/recovery.c               |   5
-rw-r--r--  libbcachefs/reflink.c                |   2
-rw-r--r--  libbcachefs/sysfs.c                  |  16
-rw-r--r--  libbcachefs/two_state_shared_lock.c  |  33
-rw-r--r--  libbcachefs/two_state_shared_lock.h  |  28
-rw-r--r--  linux/mean_and_variance.c            |   2
-rw-r--r--  linux/printbuf_userspace.c           |   5
-rw-r--r--  linux/six.c                          |   6
-rw-r--r--  linux/wait.c                         |   5

52 files changed, 900 insertions(+), 748 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 3a147c61..1b5f928a 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-61ebcb532a1266e5e36f354858b552e2a4fb9925
+8d3fc97ca3f24d8f7ab1e9ed04d8ca354c44dd8c
diff --git a/Makefile b/Makefile
index d460a6d3..01aa0b71 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ PREFIX?=/usr/local
PKG_CONFIG?=pkg-config
INSTALL=install
-CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \
+CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
-Wno-pointer-sign \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 38a364c0..a9852fa1 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -54,6 +54,8 @@ typedef struct {
#define __ATOMIC_ADD_RETURN_RELEASE(v, p) \
__atomic_add_fetch(p, v, __ATOMIC_RELEASE)
#define __ATOMIC_SUB_RETURN(v, p) __atomic_sub_fetch(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_SUB_RETURN_RELEASE(v, p) \
+ __atomic_sub_fetch(p, v, __ATOMIC_RELEASE)
#define xchg(p, v) __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
#define xchg_acquire(p, v) __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
@@ -123,6 +125,11 @@ do { \
({ smp_mb__before_atomic(); __ATOMIC_ADD_RETURN(i, v); })
#endif
+#ifndef __ATOMIC_SUB_RETURN_RELEASE
+#define __ATOMIC_SUB_RETURN_RELEASE(i, v) \
+ ({ smp_mb__before_atomic(); __ATOMIC_SUB_RETURN(i, v); })
+#endif
+
#ifndef __ATOMIC_SUB
#define __ATOMIC_SUB(i, v) __ATOMIC_SUB_RETURN(i, v)
#endif
@@ -164,6 +171,11 @@ static inline i_type a_type##_add_return_release(i_type i, a_type##_t *v)\
return __ATOMIC_ADD_RETURN_RELEASE(i, &v->counter); \
} \
\
+static inline i_type a_type##_sub_return_release(i_type i, a_type##_t *v)\
+{ \
+ return __ATOMIC_SUB_RETURN_RELEASE(i, &v->counter); \
+} \
+ \
static inline i_type a_type##_sub_return(i_type i, a_type##_t *v) \
{ \
return __ATOMIC_SUB_RETURN(i, &v->counter); \
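
/*
 * Note: a hedged sketch (editor's, not part of this patch) of why a
 * release-ordered subtract is useful: a refcount put must order all prior
 * writes to the object before another CPU can observe the count hitting
 * zero and free it. Modeled loosely on the kernel's refcount_t put path;
 * struct obj and obj_free() are hypothetical.
 */
static inline void obj_put(struct obj *o)
{
	if (atomic_sub_return_release(1, &o->ref) == 0) {
		smp_acquire__after_ctrl_dep();	/* order the free after the last put */
		obj_free(o);
	}
}
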
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d31b5f56..b2c1751c 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -229,6 +229,8 @@ static inline int __must_check kstrtos32(const char *s, unsigned int base, s32 *
}
struct printbuf;
+extern void prt_u64(struct printbuf *out, u64 num);
+
extern __printf(2, 0) void prt_vprintf(struct printbuf *out, const char *fmt, va_list args);
extern __printf(2, 3) void prt_printf(struct printbuf *out, const char *fmt, ...);
diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h
index 3d62abe7..cbac6ac8 100644
--- a/include/linux/mean_and_variance.h
+++ b/include/linux/mean_and_variance.h
@@ -155,7 +155,7 @@ struct mean_and_variance_weighted {
u64 variance;
};
-inline s64 fast_divpow2(s64 n, u8 d);
+s64 fast_divpow2(s64 n, u8 d);
struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
s64 mean_and_variance_get_mean(struct mean_and_variance s);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index d1d33e67..d30fb10d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -24,6 +24,7 @@ typedef struct {
} wait_queue_head_t;
void wake_up(wait_queue_head_t *);
+void wake_up_all(wait_queue_head_t *);
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 444f43f0..47ba750d 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -344,25 +344,29 @@ DEFINE_EVENT(btree_node, btree_node_free,
TRACE_EVENT(btree_reserve_get_fail,
TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
- size_t required),
- TP_ARGS(trans_fn, caller_ip, required),
+ size_t required,
+ int ret),
+ TP_ARGS(trans_fn, caller_ip, required, ret),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(size_t, required )
+ __array(char, ret, 32 )
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->required = required;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
),
- TP_printk("%s %pS required %zu",
+ TP_printk("%s %pS required %zu ret %s",
__entry->trans_fn,
(void *) __entry->caller_ip,
- __entry->required)
+ __entry->required,
+ __entry->ret)
);
DEFINE_EVENT(btree_node, btree_node_compact,
@@ -542,14 +546,11 @@ TRACE_EVENT(bucket_alloc_fail,
u64 avail,
u64 copygc_wait_amount,
s64 copygc_waiting_for,
- u64 seen,
- u64 open,
- u64 need_journal_commit,
- u64 nouse,
+ struct bucket_alloc_state *s,
bool nonblocking,
const char *err),
TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for,
- seen, open, need_journal_commit, nouse, nonblocking, err),
+ s, nonblocking, err),
TP_STRUCT__entry(
__field(dev_t, dev )
@@ -573,10 +574,10 @@ TRACE_EVENT(bucket_alloc_fail,
__entry->avail = avail;
__entry->copygc_wait_amount = copygc_wait_amount;
__entry->copygc_waiting_for = copygc_waiting_for;
- __entry->seen = seen;
- __entry->open = open;
- __entry->need_journal_commit = need_journal_commit;
- __entry->nouse = nouse;
+ __entry->seen = s->buckets_seen;
+ __entry->open = s->skipped_open;
+ __entry->need_journal_commit = s->skipped_need_journal_commit;
+ __entry->nouse = s->skipped_nouse;
__entry->nonblocking = nonblocking;
strscpy(__entry->err, err, sizeof(__entry->err));
),
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 796b9f5a..742313c2 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -279,6 +279,22 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
return -EINVAL;
}
+ /*
+ * XXX this is wrong, we'll be checking updates that happened from
+ * before BCH_FS_CHECK_BACKPOINTERS_DONE
+ */
+ if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+ unsigned i, bp_len = 0;
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
+ bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;
+
+ if (bp_len > a.v->dirty_sectors) {
+ prt_printf(err, "too many backpointers");
+ return -EINVAL;
+ }
+ }
+
if (rw == WRITE) {
if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
prt_printf(err, "invalid data type (got %u should be %u)",
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 186c2ed4..949c068e 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -195,26 +195,24 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
u64 bucket,
enum alloc_reserve reserve,
struct bch_alloc_v4 *a,
- u64 *skipped_open,
- u64 *skipped_need_journal_commit,
- u64 *skipped_nouse,
+ struct bucket_alloc_state *s,
struct closure *cl)
{
struct open_bucket *ob;
if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- (*skipped_nouse)++;
+ s->skipped_nouse++;
return NULL;
}
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
- (*skipped_open)++;
+ s->skipped_open++;
return NULL;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
- (*skipped_need_journal_commit)++;
+ s->skipped_need_journal_commit++;
return NULL;
}
@@ -234,7 +232,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
/* Recheck under lock: */
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
spin_unlock(&c->freelist_lock);
- (*skipped_open)++;
+ s->skipped_open++;
return NULL;
}
@@ -274,9 +272,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
enum alloc_reserve reserve, u64 free_entry,
- u64 *skipped_open,
- u64 *skipped_need_journal_commit,
- u64 *skipped_nouse,
+ struct bucket_alloc_state *s,
struct bkey_s_c freespace_k,
struct closure *cl)
{
@@ -339,7 +335,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
u64 bp_offset = 0;
ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
- &bp_offset, &bp, 0);
+ &bp_offset, &bp,
+ BTREE_ITER_NOPRESERVE);
if (ret) {
ob = ERR_PTR(ret);
goto err;
@@ -356,11 +353,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
}
}
- ob = __try_alloc_bucket(c, ca, b, reserve, &a,
- skipped_open,
- skipped_need_journal_commit,
- skipped_nouse,
- cl);
+ ob = __try_alloc_bucket(c, ca, b, reserve, &a, s, cl);
if (!ob)
iter.path->preserve = false;
err:
@@ -406,11 +399,7 @@ static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
struct bch_dev *ca,
enum alloc_reserve reserve,
- u64 *cur_bucket,
- u64 *buckets_seen,
- u64 *skipped_open,
- u64 *skipped_need_journal_commit,
- u64 *skipped_nouse,
+ struct bucket_alloc_state *s,
struct closure *cl)
{
struct btree_iter iter;
@@ -418,10 +407,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct open_bucket *ob = NULL;
int ret;
- *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket);
- *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx);
+ s->cur_bucket = max_t(u64, s->cur_bucket, ca->mi.first_bucket);
+ s->cur_bucket = max_t(u64, s->cur_bucket, ca->new_fs_bucket_idx);
- for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket),
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, s->cur_bucket),
BTREE_ITER_SLOTS, k, ret) {
struct bch_alloc_v4 a;
@@ -437,19 +426,15 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
if (a.data_type != BCH_DATA_free)
continue;
- (*buckets_seen)++;
+ s->buckets_seen++;
- ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
- skipped_open,
- skipped_need_journal_commit,
- skipped_nouse,
- cl);
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, s, cl);
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
- *cur_bucket = iter.pos.offset;
+ s->cur_bucket = iter.pos.offset;
return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found);
}
@@ -457,11 +442,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
struct bch_dev *ca,
enum alloc_reserve reserve,
- u64 *cur_bucket,
- u64 *buckets_seen,
- u64 *skipped_open,
- u64 *skipped_need_journal_commit,
- u64 *skipped_nouse,
+ struct bucket_alloc_state *s,
struct closure *cl)
{
struct btree_iter iter;
@@ -477,25 +458,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
* at previously
*/
for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
- POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
+ POS(ca->dev_idx, s->cur_bucket), 0, k, ret) {
if (k.k->p.inode != ca->dev_idx)
break;
- for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k));
- *cur_bucket < k.k->p.offset;
- (*cur_bucket)++) {
+ for (s->cur_bucket = max(s->cur_bucket, bkey_start_offset(k.k));
+ s->cur_bucket < k.k->p.offset;
+ s->cur_bucket++) {
ret = btree_trans_too_many_iters(trans);
if (ret)
break;
- (*buckets_seen)++;
+ s->buckets_seen++;
ob = try_alloc_bucket(trans, ca, reserve,
- *cur_bucket,
- skipped_open,
- skipped_need_journal_commit,
- skipped_nouse,
- k, cl);
+ s->cur_bucket, s, k, cl);
if (ob)
break;
}
@@ -525,11 +502,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized);
u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor;
u64 avail;
- u64 cur_bucket = start;
- u64 buckets_seen = 0;
- u64 skipped_open = 0;
- u64 skipped_need_journal_commit = 0;
- u64 skipped_nouse = 0;
+ struct bucket_alloc_state s = { .cur_bucket = start };
bool waiting = false;
again:
bch2_dev_usage_read_fast(ca, usage);
@@ -568,31 +541,19 @@ again:
}
ob = likely(ca->mi.freespace_initialized)
- ? bch2_bucket_alloc_freelist(trans, ca, reserve,
- &cur_bucket,
- &buckets_seen,
- &skipped_open,
- &skipped_need_journal_commit,
- &skipped_nouse,
- cl)
- : bch2_bucket_alloc_early(trans, ca, reserve,
- &cur_bucket,
- &buckets_seen,
- &skipped_open,
- &skipped_need_journal_commit,
- &skipped_nouse,
- cl);
-
- if (skipped_need_journal_commit * 2 > avail)
+ ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl)
+ : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl);
+
+ if (s.skipped_need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
if (!ob && !freespace_initialized && start) {
- start = cur_bucket = 0;
+ start = s.cur_bucket = 0;
goto again;
}
if (!freespace_initialized)
- ca->bucket_alloc_trans_early_cursor = cur_bucket;
+ ca->bucket_alloc_trans_early_cursor = s.cur_bucket;
err:
if (!ob)
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
@@ -607,10 +568,7 @@ err:
avail,
bch2_copygc_wait_amount(c),
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- buckets_seen,
- skipped_open,
- skipped_need_journal_commit,
- skipped_nouse,
+ &s,
cl == NULL,
bch2_err_str(PTR_ERR(ob)));
@@ -1152,16 +1110,17 @@ out:
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
-struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans,
- unsigned target,
- unsigned erasure_code,
- struct write_point_specifier write_point,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *cl)
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
+ unsigned target,
+ unsigned erasure_code,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum alloc_reserve reserve,
+ unsigned flags,
+ struct closure *cl,
+ struct write_point **wp_ret)
{
struct bch_fs *c = trans->c;
struct write_point *wp;
@@ -1183,7 +1142,7 @@ retry:
write_points_nr = c->write_points_nr;
have_cache = false;
- wp = writepoint_find(trans, write_point.v);
+ *wp_ret = wp = writepoint_find(trans, write_point.v);
if (wp->data_type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -1240,7 +1199,7 @@ alloc_done:
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
- return wp;
+ return 0;
err:
open_bucket_for_each(c, &wp->ptrs, ob, i)
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
@@ -1258,39 +1217,13 @@ err:
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
bch2_err_matches(ret, BCH_ERR_freelist_empty))
return cl
- ? ERR_PTR(-EAGAIN)
- : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc);
+ ? -EAGAIN
+ : -BCH_ERR_ENOSPC_bucket_alloc;
if (bch2_err_matches(ret, BCH_ERR_insufficient_devices))
- return ERR_PTR(-EROFS);
-
- return ERR_PTR(ret);
-}
-
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
- unsigned target,
- unsigned erasure_code,
- struct write_point_specifier write_point,
- struct bch_devs_list *devs_have,
- unsigned nr_replicas,
- unsigned nr_replicas_required,
- enum alloc_reserve reserve,
- unsigned flags,
- struct closure *cl)
-{
- struct write_point *wp;
-
- bch2_trans_do(c, NULL, NULL, 0,
- PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target,
- erasure_code,
- write_point,
- devs_have,
- nr_replicas,
- nr_replicas_required,
- reserve,
- flags, cl)));
- return wp;
+ return -EROFS;
+ return ret;
}
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
@@ -1361,6 +1294,10 @@ static inline void writepoint_init(struct write_point *wp,
{
mutex_init(&wp->lock);
wp->data_type = type;
+
+ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+ INIT_LIST_HEAD(&wp->writes);
+ spin_lock_init(&wp->writes_lock);
}
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
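
/*
 * Note: hedged sketch of a caller adapting to the new int-returning
 * bch2_alloc_sectors_start_trans() above; the locked write point now comes
 * back via the out-parameter (cf. the __bch2_btree_node_alloc() hunk in
 * btree_update_interior.c below):
 */
struct write_point *wp;
int ret;

ret = bch2_alloc_sectors_start_trans(trans, target, erasure_code,
				     write_point, devs_have,
				     nr_replicas, nr_replicas_required,
				     reserve, flags, cl, &wp);
if (ret)
	return ERR_PTR(ret);
/* wp is locked: allocate from it, then release with bch2_alloc_sectors_done() */
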
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 6de63a35..16490ffb 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -136,22 +136,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
unsigned, unsigned *, bool *, enum alloc_reserve,
unsigned, struct closure *);
-struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *,
- unsigned, unsigned,
- struct write_point_specifier,
- struct bch_devs_list *,
- unsigned, unsigned,
- enum alloc_reserve,
- unsigned,
- struct closure *);
-struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
- unsigned, unsigned,
- struct write_point_specifier,
- struct bch_devs_list *,
- unsigned, unsigned,
- enum alloc_reserve,
- unsigned,
- struct closure *);
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+ unsigned, unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum alloc_reserve,
+ unsigned,
+ struct closure *,
+ struct write_point **);
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index e078584d..e66a85f7 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -8,6 +8,14 @@
#include "clock_types.h"
#include "fifo.h"
+struct bucket_alloc_state {
+ u64 cur_bucket;
+ u64 buckets_seen;
+ u64 skipped_open;
+ u64 skipped_need_journal_commit;
+ u64 skipped_nouse;
+};
+
struct ec_bucket_buf;
#define BCH_ALLOC_RESERVES() \
@@ -78,6 +86,11 @@ struct write_point {
struct open_buckets ptrs;
struct dev_stripe_state stripe;
+
+ struct work_struct index_update_work;
+
+ struct list_head writes;
+ spinlock_t writes_lock;
};
struct write_point_specifier {
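
/*
 * Note: hedged sketch of how the new bucket_alloc_state threads through
 * the allocation path, mirroring the alloc_foreground.c hunks above:
 */
struct bucket_alloc_state s = { .cur_bucket = start };

ob = bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl);

if (s.skipped_need_journal_commit * 2 > avail)
	bch2_journal_flush_async(&c->journal, NULL);
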
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index dace68e2..614811ea 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -9,8 +9,6 @@
#include <linux/mm.h>
-#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
-
/*
* Convert from pos in backpointer btree to pos of corresponding bucket in alloc
* btree:
@@ -43,27 +41,6 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
return ret;
}
-void bch2_extent_ptr_to_bp(struct bch_fs *c,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- struct bpos *bucket_pos, struct bch_backpointer *bp)
-{
- enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
- s64 sectors = level ? btree_sectors(c) : k.k->size;
- u32 bucket_offset;
-
- *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
- *bp = (struct bch_backpointer) {
- .btree_id = btree_id,
- .level = level,
- .data_type = data_type,
- .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
- p.crc.offset,
- .bucket_len = ptr_disk_sectors(sectors, p),
- .pos = k.k->p,
- };
-}
-
static bool extent_matches_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h
index 8c58f929..48a48b75 100644
--- a/libbcachefs/backpointers.h
+++ b/libbcachefs/backpointers.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#include "buckets.h"
#include "super.h"
int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
@@ -16,9 +17,28 @@ void bch2_backpointer_swab(struct bkey_s);
.swab = bch2_backpointer_swab, \
})
-void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned,
- struct bkey_s_c, struct extent_ptr_decoded,
- struct bpos *, struct bch_backpointer *);
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
+ s64 sectors = level ? btree_sectors(c) : k.k->size;
+ u32 bucket_offset;
+
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bp = (struct bch_backpointer) {
+ .btree_id = btree_id,
+ .level = level,
+ .data_type = data_type,
+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+ p.crc.offset,
+ .bucket_len = ptr_disk_sectors(sectors, p),
+ .pos = k.k->p,
+ };
+}
int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *,
struct bch_backpointer, struct bkey_s_c);
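
/*
 * Note: worked example (editor's) of the bucket_offset encoding in
 * bch2_extent_ptr_to_bp() above: the low MAX_EXTENT_COMPRESS_RATIO_SHIFT
 * (10) bits hold p.crc.offset, the high bits the sector offset within the
 * bucket. With bucket_offset = 37 and p.crc.offset = 5:
 *
 *	bp.bucket_offset = (37 << 10) + 5 = 37893
 *
 * and decoding recovers both halves:
 */
u64 bucket_sector = bp.bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;	       /* 37 */
u64 crc_offset	  = bp.bucket_offset & ((1U << MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1); /* 5 */
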
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index fcbe8f8c..8a43fcfa 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -226,6 +226,10 @@ do { \
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
+#define BCACHEFS_LOG_PREFIX
+#endif
+
+#ifdef BCACHEFS_LOG_PREFIX
#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
@@ -598,6 +602,23 @@ typedef struct {
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+#define BCH_BTREE_WRITE_TYPES() \
+ x(initial, 0) \
+ x(init_next_bset, 1) \
+ x(cache_reclaim, 2) \
+ x(journal_reclaim, 3) \
+ x(interior, 4)
+
+enum btree_write_type {
+#define x(t, n) BTREE_WRITE_##t,
+ BCH_BTREE_WRITE_TYPES()
+#undef x
+ BTREE_WRITE_TYPE_NR,
+};
+
+#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
+
struct bch_fs {
struct closure cl;
@@ -707,6 +728,13 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
+ /* btree_io.c: */
+ spinlock_t btree_write_error_lock;
+ struct btree_write_stats {
+ atomic64_t nr;
+ atomic64_t bytes;
+ } btree_write_stats[BTREE_WRITE_TYPE_NR];
+
/* btree_iter.c: */
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
@@ -881,11 +909,6 @@ struct bch_fs {
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
-
- atomic64_t btree_writes_nr;
- atomic64_t btree_writes_sectors;
- spinlock_t btree_write_error_lock;
-
/* ERRORS */
struct list_head fsck_errors;
struct mutex fsck_error_lock;
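
/*
 * Note: sketch of what the BCH_BTREE_WRITE_TYPES() x-macro above expands
 * to (the n values are only used for the name table in btree_io.c):
 *
 *	enum btree_write_type {
 *		BTREE_WRITE_initial,		// 0
 *		BTREE_WRITE_init_next_bset,	// 1
 *		BTREE_WRITE_cache_reclaim,	// 2
 *		BTREE_WRITE_journal_reclaim,	// 3
 *		BTREE_WRITE_interior,		// 4
 *		BTREE_WRITE_TYPE_NR,		// 5
 *	};
 *
 * With five types, roundup_pow_of_two(5) = 8, so BTREE_WRITE_TYPE_MASK = 7
 * and BTREE_WRITE_TYPE_BITS = 3: the write type fits in the low three bits
 * of the btree write flags word (see btree_io.h below).
 */
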
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index 8518054a..be0d4bc1 100644
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -178,7 +178,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
continue;
while ((next = sort_iter_peek(iter)) &&
- !bch2_bkey_cmp_packed(iter->b, in, next)) {
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
needs_whiteout |= in->needs_whiteout;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index d1cbf926..75e74479 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -280,9 +280,11 @@ wait_on_io:
* the post write cleanup:
*/
if (bch2_verify_btree_ondisk)
- bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
+ BTREE_WRITE_cache_reclaim);
else
- __bch2_btree_node_write(c, b, 0);
+ __bch2_btree_node_write(c, b,
+ BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -389,7 +391,7 @@ restart:
six_trylock_read(&b->c.lock)) {
list_move(&bc->live, &b->list);
mutex_unlock(&bc->lock);
- __bch2_btree_node_write(c, b, 0);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
if (touched >= nr)
goto out_nounlock;
@@ -675,6 +677,7 @@ out:
b->flags = 0;
b->written = 0;
b->nsets = 0;
+ b->write_type = 0;
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
@@ -1118,7 +1121,7 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
if (btree_node_dirty(b)) {
- __bch2_btree_node_write(c, b, 0);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 90f67ccd..cee3b500 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -451,6 +451,24 @@ void bch2_btree_build_aux_trees(struct btree *b)
}
/*
+ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
+ *
+ * The first bset is going to be of similar order to the size of the node, the
+ * last bset is bounded by btree_write_set_buffer(), which is set to keep the
+ * memmove on insert from being too expensive: the middle bset should, ideally,
+ * be the geometric mean of the first and the last.
+ *
+ * Returns true if the middle bset is greater than that geometric mean:
+ */
+static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
+{
+ unsigned mid_u64s_bits =
+ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
+
+ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
+}
+
+/*
* @bch_btree_init_next - initialize a new (unwritten) bset that can then be
* inserted into
*
@@ -467,19 +485,14 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
EBUG_ON(!(b->c.lock.state.seq & 1));
BUG_ON(bset_written(b, bset(b, &b->set[1])));
+ BUG_ON(btree_node_just_written(b));
if (b->nsets == MAX_BSETS &&
- !btree_node_write_in_flight(b)) {
- unsigned log_u64s[] = {
- ilog2(bset_u64s(&b->set[0])),
- ilog2(bset_u64s(&b->set[1])),
- ilog2(bset_u64s(&b->set[2])),
- };
-
- if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
- bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
- reinit_iter = true;
- }
+ !btree_node_write_in_flight(b) &&
+ should_compact_all(c, b)) {
+ bch2_btree_node_write(c, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
+ reinit_iter = true;
}
if (b->nsets == MAX_BSETS &&
@@ -1653,7 +1666,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
} while ((v = cmpxchg(&b->flags, old, new)) != old);
if (new & (1U << BTREE_NODE_write_in_flight))
- __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type);
else
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
@@ -1802,6 +1815,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool used_mempool;
unsigned long old, new;
bool validate_before_checksum = false;
+ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
int ret;
@@ -1848,6 +1862,12 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
if (new & (1U << BTREE_NODE_need_write))
return;
do_write:
+ if ((flags & BTREE_WRITE_ONLY_IF_NEED))
+ type = b->write_type;
+ b->write_type = 0;
+
+ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
+
atomic_dec(&c->btree_cache.dirty);
BUG_ON(btree_node_fake(b));
@@ -2022,8 +2042,8 @@ do_write:
bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
cpu_to_le16(b->written);
- atomic64_inc(&c->btree_writes_nr);
- atomic64_add(sectors_to_write, &c->btree_writes_sectors);
+ atomic64_inc(&c->btree_write_stats[type].nr);
+ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
queue_work(c->io_complete_wq, &wbio->work);
@@ -2151,3 +2171,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
+
+const char * const bch2_btree_write_types[] = {
+#define x(t, n) [n] = #t,
+ BCH_BTREE_WRITE_TYPES()
+ NULL
+};
+
+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 10);
+
+ prt_tab(out);
+ prt_str(out, "nr");
+ prt_tab(out);
+ prt_str(out, "size");
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
+ u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
+ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
+
+ prt_printf(out, "%s:", bch2_btree_write_types[i]);
+ prt_tab(out);
+ prt_u64(out, nr);
+ prt_tab(out);
+ prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
+ prt_newline(out);
+ }
+}
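
/*
 * Note: worked example (editor's, with assumed sizes) of the geometric-
 * mean check in should_compact_all(): for ilog2(btree_max_u64s(c)) = 15
 * (a 256k node) and BTREE_WRITE_SET_U64s_BITS = 9,
 *
 *	mid_u64s_bits = (15 + 9) / 2 = 12
 *
 * i.e. the geometric mean sqrt(2^15 * 2^9) = 2^12, so the node gets sorted
 * down to a single bset once the middle bset exceeds 4096 u64s.
 */
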
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 8af85364..4b1810ad 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -139,8 +139,12 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
-#define BTREE_WRITE_ONLY_IF_NEED (1U << 0)
-#define BTREE_WRITE_ALREADY_STARTED (1U << 1)
+enum btree_write_flags {
+ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
+ __BTREE_WRITE_ALREADY_STARTED,
+};
+#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED )
+#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED)
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
@@ -219,4 +223,6 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
bn->min_key = bpos_nosnap_successor(bn->min_key);
}
+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
+
#endif /* _BCACHEFS_BTREE_IO_H */
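
/*
 * Note: hedged sketch of the flags layout after this change: the low
 * BTREE_WRITE_TYPE_BITS bits of the flags word carry the write type and
 * the remaining bits the boolean flags, so a caller can pass e.g.
 *
 *	__bch2_btree_node_write(c, b,
 *			BTREE_WRITE_cache_reclaim|BTREE_WRITE_ONLY_IF_NEED);
 *
 * and __bch2_btree_node_write() recovers the type via
 * flags & BTREE_WRITE_TYPE_MASK.
 */
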
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 99a92a89..5080f56b 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -646,9 +646,9 @@ static inline void __btree_path_level_init(struct btree_path *path,
bch2_btree_node_iter_peek(&l->iter, l->b);
}
-inline void bch2_btree_path_level_init(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+void bch2_btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
BUG_ON(path->cached);
@@ -1172,11 +1172,10 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
btree_path_traverse_one(trans, path, flags, _RET_IP_);
}
-static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
struct btree_path *src)
{
unsigned i, offset = offsetof(struct btree_path, pos);
- int cmp = btree_path_cmp(dst, src);
memcpy((void *) dst + offset,
(void *) src + offset,
@@ -1188,9 +1187,6 @@ static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
if (t != BTREE_NODE_UNLOCKED)
six_lock_increment(&dst->l[i].b->c.lock, t);
}
-
- if (cmp)
- bch2_btree_path_check_sort_fast(trans, dst, cmp);
}
static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
@@ -1203,21 +1199,18 @@ static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btr
return new;
}
+__flatten
struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
struct btree_path *path, bool intent,
unsigned long ip)
{
- if (path->ref > 1 || path->preserve) {
- __btree_path_put(path, intent);
- path = btree_path_clone(trans, path, intent);
- path->preserve = false;
+ __btree_path_put(path, intent);
+ path = btree_path_clone(trans, path, intent);
+ path->preserve = false;
#ifdef CONFIG_BCACHEFS_DEBUG
- path->ip_allocated = ip;
+ path->ip_allocated = ip;
#endif
- btree_trans_verify_sorted(trans);
- }
-
- path->should_be_locked = false;
+ btree_trans_verify_sorted(trans);
return path;
}
@@ -1554,7 +1547,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
return path;
}
-inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{
struct btree_path_level *l = path_l(path);
@@ -2536,6 +2529,18 @@ static inline void btree_path_swap(struct btree_trans *trans,
btree_path_verify_sorted_ref(trans, r);
}
+static inline struct btree_path *sib_btree_path(struct btree_trans *trans,
+ struct btree_path *path, int sib)
+{
+ unsigned idx = (unsigned) path->sorted_idx + sib;
+
+ EBUG_ON(sib != -1 && sib != 1);
+
+ return idx < trans->nr_sorted
+ ? trans->paths + trans->sorted[idx]
+ : NULL;
+}
+
static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans,
struct btree_path *path,
int cmp)
@@ -2545,9 +2550,7 @@ static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *
EBUG_ON(!cmp);
- while ((n = cmp < 0
- ? prev_btree_path(trans, path)
- : next_btree_path(trans, path)) &&
+ while ((n = sib_btree_path(trans, path, cmp)) &&
(cmp2 = btree_path_cmp(n, path)) &&
cmp2 != cmp)
btree_path_swap(trans, n, path);
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 8c35d7d4..bad51ceb 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -165,13 +165,12 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
-inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
struct btree_iter *, struct bpos);
-inline void bch2_btree_path_level_init(struct btree_trans *,
- struct btree_path *, struct btree *);
+void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index 9d090437..dce2dc0c 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -173,10 +173,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
}
if (unlikely(!best)) {
- struct bch_fs *c = g->g->trans->c;
struct printbuf buf = PRINTBUF;
- bch_err(c, "cycle of nofail locks");
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
for (i = g->g; i < g->g + g->nr; i++) {
struct btree_trans *trans = i->trans;
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index cab3de0d..d89489e4 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -77,6 +77,7 @@ struct btree {
u8 nsets;
u8 nr_key_bits;
u16 version_ondisk;
+ u8 write_type;
struct bkey_format format;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 5ce91ae6..dac2fa6b 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -246,6 +246,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
+ int ret;
if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = 0;
@@ -268,7 +269,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- wp = bch2_alloc_sectors_start_trans(trans,
+ ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
@@ -276,9 +277,9 @@ retry:
&devs_have,
res->nr_replicas,
c->opts.metadata_replicas_required,
- alloc_reserve, 0, cl);
- if (IS_ERR(wp))
- return ERR_CAST(wp);
+ alloc_reserve, 0, cl, &wp);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
@@ -1178,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
}
if (ret) {
- trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]);
+ trace_and_count(c, btree_reserve_get_fail, trans->fn,
+ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
goto err;
}
@@ -1307,6 +1309,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
+ b->write_type = BTREE_WRITE_interior;
printbuf_exit(&buf);
}
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index dabe8159..2e6d220c 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -282,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
struct bkey_packed k;
BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey *u = (void *) &k;
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 3a683820..b930b788 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -181,6 +181,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new |= 1 << BTREE_NODE_need_write;
} while ((v = cmpxchg(&b->flags, old, new)) != old);
+ b->write_type = BTREE_WRITE_journal_reclaim;
+
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
@@ -289,7 +291,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
return 0;
}
-static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
@@ -721,33 +723,34 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
return ret;
}
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
static inline int trans_lock_write(struct btree_trans *trans)
{
struct btree_insert_entry *i;
- int ret;
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
- ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c);
- if (ret)
- goto fail;
+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ return trans_lock_write_fail(trans, i);
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
-fail:
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
@@ -758,6 +761,33 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ struct printbuf *err)
+{
+ struct bch_fs *c = trans->c;
+ int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
+
+ printbuf_reset(err);
+ prt_printf(err, "invalid bkey on insert from %s -> %ps",
+ trans->fn, (void *) i->ip_allocated);
+ prt_newline(err);
+ printbuf_indent_add(err, 2);
+
+ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
+ prt_newline(err);
+
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, rw, err);
+ bch2_print_string_as_lines(KERN_ERR, err->buf);
+
+ bch2_inconsistent_error(c);
+ bch2_dump_trans_updates(trans);
+ printbuf_exit(err);
+
+ return -EINVAL;
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -772,24 +802,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
trans_for_each_update(trans, i) {
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->bkey_type, rw, &buf)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "invalid bkey on insert from %s -> %ps",
- trans->fn, (void *) i->ip_allocated);
- prt_newline(&buf);
- printbuf_indent_add(&buf, 2);
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
- prt_newline(&buf);
-
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->bkey_type, rw, &buf);
-
- bch2_trans_inconsistent(trans, "%s", buf.buf);
- printbuf_exit(&buf);
- return -EINVAL;
- }
+ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, rw, &buf)))
+ return bch2_trans_commit_bkey_invalid(trans, i, &buf);
btree_insert_entry_checks(trans, i);
}
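
/*
 * Note: both refactors above follow the same pattern: keep the hot path
 * small enough to inline and push cold error handling into a noinline
 * helper. A hedged generic sketch (struct foo, try_lock() and unwind()
 * are hypothetical):
 */
static noinline int do_thing_fail(struct foo *f)
{
	unwind(f);
	return -EAGAIN;
}

static inline int do_thing(struct foo *f)
{
	if (unlikely(!try_lock(f)))
		return do_thing_fail(f);
	return 0;
}
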
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index cd297941..bf01837e 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1263,23 +1263,24 @@ void fs_usage_apply_warn(struct btree_trans *trans,
struct btree_insert_entry *i;
struct printbuf buf = PRINTBUF;
- bch_err(c, "disk usage increased %lli more than %u sectors reserved",
- should_not_have_added, disk_res_sectors);
+ prt_printf(&buf,
+ bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"),
+ should_not_have_added, disk_res_sectors);
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
- pr_err("while inserting");
- printbuf_reset(&buf);
+ prt_str(&buf, "new ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
- pr_err(" %s", buf.buf);
- pr_err("overlapping with");
- printbuf_reset(&buf);
+ prt_newline(&buf);
+
+ prt_str(&buf, "old ");
bch2_bkey_val_to_text(&buf, c, old);
- pr_err(" %s", buf.buf);
+ prt_newline(&buf);
}
__WARN();
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
@@ -1949,7 +1950,7 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
#define SECTORS_CACHE 1024
-int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 56c06ccd..b6a1db76 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -259,15 +259,39 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
- this_cpu_sub(*c->online_reserved, res->sectors);
- res->sectors = 0;
+ if (res->sectors) {
+ this_cpu_sub(*c->online_reserved, res->sectors);
+ res->sectors = 0;
+ }
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
-int bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- u64, int);
+int __bch2_disk_reservation_add(struct bch_fs *,
+ struct disk_reservation *,
+ u64, int);
+
+static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, int flags)
+{
+#ifdef __KERNEL__
+ u64 old, new;
+
+ do {
+ old = this_cpu_read(c->pcpu->sectors_available);
+ if (sectors > old)
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+
+ new = old - sectors;
+ } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
+
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+ return 0;
+#else
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+#endif
+}
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
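
/*
 * Note: hedged usage sketch of the new percpu fast path above: callers
 * reserve, write, then release; __bch2_disk_reservation_add() is only
 * reached when the percpu sectors_available cache runs dry.
 */
struct disk_reservation res = { .nr_replicas = nr_replicas };
int ret = bch2_disk_reservation_add(c, &res, sectors, 0);

if (!ret) {
	/* ... perform the write ... */
	bch2_disk_reservation_put(c, &res);	/* releases res.sectors to online_reserved */
}
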
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 3268e8d4..43d22fe8 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -316,7 +316,7 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
-int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 3d6d13bc..f7ccef7a 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -61,8 +61,16 @@ int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
-int bch2_encrypt_bio(struct bch_fs *, unsigned,
- struct nonce, struct bio *);
+int __bch2_encrypt_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
+
+static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ return bch2_csum_type_is_encryption(type)
+ ? __bch2_encrypt_bio(c, type, nonce, bio)
+ : 0;
+}
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
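
/*
 * Note: with the inline wrapper above, call sites no longer need their own
 * bch2_csum_type_is_encryption() check before calling:
 *
 *	ret = bch2_encrypt_bio(c, crc.csum_type, nonce, bio);
 *
 * In the common unencrypted case this is now a cheap inline test rather
 * than an out-of-line call.
 */
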
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index b75ff07e..be45cf54 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -97,7 +97,7 @@ static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
ptr->cached = true;
}
-static int bch2_data_update_index_update(struct bch_write_op *op)
+int bch2_data_update_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans trans;
@@ -225,7 +225,7 @@ static int bch2_data_update_index_update(struct bch_write_op *op)
bch2_trans_update(&trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, &op->res,
- op_journal_seq(op),
+ NULL,
BTREE_INSERT_NOFAIL|
m->data_opts.btree_insert_flags);
if (!ret) {
@@ -270,8 +270,7 @@ out:
}
void bch2_data_update_read_done(struct data_update *m,
- struct bch_extent_crc_unpacked crc,
- struct closure *cl)
+ struct bch_extent_crc_unpacked crc)
{
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
@@ -279,7 +278,7 @@ void bch2_data_update_read_done(struct data_update *m,
m->op.crc = crc;
m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
- closure_call(&m->op.cl, bch2_write, NULL, cl);
+ closure_call(&m->op.cl, bch2_write, NULL, NULL);
}
void bch2_data_update_exit(struct data_update *update)
@@ -317,14 +316,13 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m,
m->op.flags |= BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
BCH_WRITE_DATA_ENCODED|
- BCH_WRITE_FROM_INTERNAL|
+ BCH_WRITE_MOVE|
m->data_opts.write_flags;
m->op.compression_type =
bch2_compression_opt_to_type[io_opts.background_compression ?:
io_opts.compression];
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_movinggc;
- m->op.index_update_fn = bch2_data_update_index_update;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
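
/*
 * Note (editor's inference, hedged): with op->index_update_fn gone, the
 * write completion path presumably dispatches on the new BCH_WRITE_MOVE
 * flag, along the lines of:
 *
 *	ret = !(op->flags & BCH_WRITE_MOVE)
 *		? bch2_write_index_default(op)
 *		: bch2_data_update_index_update(op);
 *
 * which is why bch2_data_update_index_update() is now declared in
 * data_update.h below.
 */
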
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
index 6793aa57..5d869079 100644
--- a/libbcachefs/data_update.h
+++ b/libbcachefs/data_update.h
@@ -26,9 +26,10 @@ struct data_update {
struct bch_write_op op;
};
+int bch2_data_update_index_update(struct bch_write_op *);
+
void bch2_data_update_read_done(struct data_update *,
- struct bch_extent_crc_unpacked,
- struct closure *);
+ struct bch_extent_crc_unpacked);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct bch_fs *, struct data_update *,
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 2fb5102e..3e49d72d 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -125,8 +125,10 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
s->nr++;
}
+#ifdef BCACHEFS_LOG_PREFIX
if (!strncmp(fmt, "bcachefs:", 9))
prt_printf(out, bch2_log_msg(c, ""));
+#endif
va_start(args, fmt);
prt_vprintf(out, fmt, args);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 3900995d..6d0a6dec 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -65,7 +65,6 @@ struct quota_res {
};
struct bch_writepage_io {
- struct closure cl;
struct bch_inode_info *inode;
/* must be last: */
@@ -73,11 +72,13 @@ struct bch_writepage_io {
};
struct dio_write {
- struct completion done;
struct kiocb *req;
+ struct address_space *mapping;
+ struct bch_inode_info *inode;
struct mm_struct *mm;
unsigned loop:1,
sync:1,
+ flush:1,
free_iov:1;
struct quota_res quota_res;
u64 written;
@@ -98,7 +99,7 @@ struct dio_read {
};
/* pagecache_block must be held */
-static int write_invalidate_inode_pages_range(struct address_space *mapping,
+static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
loff_t start, loff_t end)
{
int ret;
@@ -750,25 +751,25 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf)
if (fdm > mapping) {
struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
- if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+ if (bch2_pagecache_add_tryget(inode))
goto got_lock;
- bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+ bch2_pagecache_block_put(fdm_host);
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
+ bch2_pagecache_add_put(inode);
- bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+ bch2_pagecache_block_get(fdm_host);
/* Signal that lock has been dropped: */
set_fdm_dropped_locks();
return VM_FAULT_SIGBUS;
}
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
got_lock:
ret = filemap_fault(vmf);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
return ret;
}
@@ -796,7 +797,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
* a write_invalidate_inode_pages_range() that works without dropping
* page lock before invalidating page
*/
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
lock_page(page);
isize = i_size_read(&inode->v);
@@ -829,7 +830,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
wait_for_stable_page(page);
ret = VM_FAULT_LOCKED;
out:
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
sb_end_pagefault(inode->v.i_sb);
return ret;
@@ -1097,7 +1098,7 @@ void bch2_readahead(struct readahead_control *ractl)
bch2_trans_init(&trans, c, 0, 0);
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
while ((page = readpage_iter_next(&readpages_iter))) {
pgoff_t index = readpages_iter.offset + readpages_iter.idx;
@@ -1120,7 +1121,7 @@ void bch2_readahead(struct readahead_control *ractl)
&readpages_iter);
}
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
@@ -1200,18 +1201,10 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs
};
}
-static void bch2_writepage_io_free(struct closure *cl)
+static void bch2_writepage_io_done(struct bch_write_op *op)
{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
-
- bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_io_done(struct closure *cl)
-{
- struct bch_writepage_io *io = container_of(cl,
- struct bch_writepage_io, cl);
+ struct bch_writepage_io *io =
+ container_of(op, struct bch_writepage_io, op);
struct bch_fs *c = io->op.c;
struct bio *bio = &io->op.wbio.bio;
struct bvec_iter_all iter;
@@ -1273,7 +1266,7 @@ static void bch2_writepage_io_done(struct closure *cl)
end_page_writeback(bvec->bv_page);
}
- closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
+ bio_put(&io->op.wbio.bio);
}
static void bch2_writepage_do_io(struct bch_writepage_state *w)
@@ -1281,8 +1274,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
struct bch_writepage_io *io = w->io;
w->io = NULL;
- closure_call(&io->op.cl, bch2_write, NULL, &io->cl);
- continue_at(&io->cl, bch2_writepage_io_done, NULL);
+ closure_call(&io->op.cl, bch2_write, NULL, NULL);
}
/*
@@ -1304,9 +1296,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
&c->writepage_bioset),
struct bch_writepage_io, op.wbio.bio);
- closure_init(&w->io->cl, NULL);
w->io->inode = inode;
-
op = &w->io->op;
bch2_write_op_init(op, c, w->opts);
op->target = w->opts.foreground_target;
@@ -1315,6 +1305,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
+ op->end_io = bch2_writepage_io_done;
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}
@@ -1438,7 +1429,8 @@ do_io:
/* Check for writing past i_size: */
WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)));
+ round_up(i_size, block_bytes(c)) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
w->io->op.res.sectors += reserved_sectors;
w->io->op.i_sectors_delta -= dirty_sectors;
@@ -1490,7 +1482,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_page_reservation_init(c, inode, res);
*fsdata = res;
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
page = grab_cache_page_write_begin(mapping, index);
if (!page)
@@ -1547,7 +1539,7 @@ err:
put_page(page);
*pagep = NULL;
err_unlock:
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
kfree(res);
*fsdata = NULL;
return bch2_err_class(ret);
@@ -1591,7 +1583,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
bch2_page_reservation_put(c, inode, res);
kfree(res);
@@ -1760,7 +1752,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
ssize_t written = 0;
int ret = 0;
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
do {
unsigned offset = pos & (PAGE_SIZE - 1);
@@ -1818,7 +1810,7 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
return written ? written : ret;
}
@@ -1981,11 +1973,13 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (iocb->ki_flags & IOCB_DIRECT) {
struct blk_plug plug;
- ret = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret < 0)
- goto out;
+ if (unlikely(mapping->nrpages)) {
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ goto out;
+ }
file_accessed(file);
@@ -1996,9 +1990,9 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret >= 0)
iocb->ki_pos += ret;
} else {
- bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_get(inode);
ret = generic_file_read_iter(iocb, iter);
- bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(inode);
}
out:
return bch2_err_class(ret);
@@ -2050,31 +2044,154 @@ err:
return err ? false : ret;
}
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ return bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0);
+}
+
static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-static long bch2_dio_write_loop(struct dio_write *dio)
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+ struct iovec *iov = dio->inline_vecs;
+
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+ GFP_KERNEL);
+ if (unlikely(!iov))
+ return -ENOMEM;
+
+ dio->free_iov = true;
+ }
+
+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
+ dio->iter.iov = iov;
+ return 0;
+}
+
+static void bch2_dio_write_flush_done(struct closure *cl)
+{
+ struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+ struct bch_fs *c = dio->op.c;
+
+ closure_debug_destroy(cl);
+
+ dio->op.error = bch2_journal_error(&c->journal);
+
+ bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ dio->flush = 0;
+
+ closure_init(&dio->op.cl, NULL);
+
+ if (!dio->op.error) {
+ ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+ if (ret)
+ dio->op.error = ret;
+ else
+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
+ }
+
+ if (dio->sync) {
+ closure_sync(&dio->op.cl);
+ closure_debug_destroy(&dio->op.cl);
+ } else {
+ continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+ }
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
- struct address_space *mapping = req->ki_filp->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_inode_info *inode = dio->inode;
+ bool sync = dio->sync;
+ long ret;
+
+ if (unlikely(dio->flush)) {
+ bch2_dio_write_flush(dio);
+ if (!sync)
+ return -EIOCBQUEUED;
+ }
+
+ bch2_pagecache_block_put(inode);
+ bch2_quota_reservation_put(c, inode, &dio->quota_res);
+
+ if (dio->free_iov)
+ kfree(dio->iter.iov);
+
+ ret = dio->op.error ?: ((long) dio->written << 9);
+ bio_put(&dio->op.wbio.bio);
+
+ /* inode->i_dio_count is our ref on inode and thus bch_fs */
+ inode_dio_end(&inode->v);
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+
+ if (!sync) {
+ req->ki_complete(req, ret);
+ ret = -EIOCBQUEUED;
+ }
+ return ret;
+}
+
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
struct bio *bio = &dio->op.wbio.bio;
struct bvec_iter_all iter;
struct bio_vec *bv;
+
+ i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+ req->ki_pos += (u64) dio->op.written << 9;
+ dio->written += dio->op.written;
+
+ spin_lock(&inode->v.i_lock);
+ if (req->ki_pos > inode->v.i_size)
+ i_size_write(&inode->v, req->ki_pos);
+ spin_unlock(&inode->v.i_lock);
+
+ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
+ bio_for_each_segment_all(bv, bio, iter)
+ put_page(bv->bv_page);
+
+ if (unlikely(dio->op.error))
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
+static long bch2_dio_write_loop(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct address_space *mapping = dio->mapping;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
unsigned unaligned, iter_count;
bool sync = dio->sync, dropped_locks;
long ret;
- if (dio->loop)
- goto loop;
-
while (1) {
iter_count = dio->iter.count;
- if (kthread && dio->mm)
- kthread_use_mm(dio->mm);
- BUG_ON(current->faults_disabled_mapping);
+ EBUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
@@ -2082,8 +2199,6 @@ static long bch2_dio_write_loop(struct dio_write *dio)
dropped_locks = fdm_dropped_locks();
current->faults_disabled_mapping = NULL;
- if (kthread && dio->mm)
- kthread_unuse_mm(dio->mm);
/*
* If the fault handler returned an error but also signalled
@@ -2121,116 +2236,80 @@ static long bch2_dio_write_loop(struct dio_write *dio)
}
bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
- dio->op.end_io = bch2_dio_write_loop_async;
+ dio->op.end_io = sync
+ ? NULL
+ : bch2_dio_write_loop_async;
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
- if ((req->ki_flags & IOCB_DSYNC) &&
- !c->opts.journal_flush_disabled)
- dio->op.flags |= BCH_WRITE_FLUSH;
+ if (sync)
+ dio->op.flags |= BCH_WRITE_SYNC;
dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
- !bch2_check_range_allocated(c, inode_inum(inode),
- dio->op.pos.offset, bio_sectors(bio),
- dio->op.opts.data_replicas,
- dio->op.opts.compression != 0))
+ !bch2_dio_write_check_allocated(dio))
goto err;
task_io_account_write(bio->bi_iter.bi_size);
- if (!dio->sync && !dio->loop && dio->iter.count) {
- struct iovec *iov = dio->inline_vecs;
-
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
- GFP_KERNEL);
- if (unlikely(!iov)) {
- dio->sync = sync = true;
- goto do_io;
- }
+ if (unlikely(dio->iter.count) &&
+ !dio->sync &&
+ !dio->loop &&
+ bch2_dio_write_copy_iov(dio))
+ dio->sync = sync = true;
- dio->free_iov = true;
- }
-
- memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.iov = iov;
- }
-do_io:
dio->loop = true;
closure_call(&dio->op.cl, bch2_write, NULL, NULL);
- if (sync)
- wait_for_completion(&dio->done);
- else
+ if (!sync)
return -EIOCBQUEUED;
-loop:
- i_sectors_acct(c, inode, &dio->quota_res,
- dio->op.i_sectors_delta);
- req->ki_pos += (u64) dio->op.written << 9;
- dio->written += dio->op.written;
- spin_lock(&inode->v.i_lock);
- if (req->ki_pos > inode->v.i_size)
- i_size_write(&inode->v, req->ki_pos);
- spin_unlock(&inode->v.i_lock);
+ bch2_dio_write_end(dio);
- if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
- bio_for_each_segment_all(bv, bio, iter)
- put_page(bv->bv_page);
- bio->bi_vcnt = 0;
-
- if (dio->op.error) {
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
- break;
- }
-
- if (!dio->iter.count)
+ if (likely(!dio->iter.count) || dio->op.error)
break;
bio_reset(bio, NULL, REQ_OP_WRITE);
- reinit_completion(&dio->done);
}
-
- ret = dio->op.error ?: ((long) dio->written << 9);
+out:
+ return bch2_dio_write_done(dio);
err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ dio->op.error = ret;
- if (dio->free_iov)
- kfree(dio->iter.iov);
+ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+ struct bvec_iter_all iter;
+ struct bio_vec *bv;
- if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
- bio_put(bio);
-
- /* inode->i_dio_count is our ref on inode and thus bch_fs */
- inode_dio_end(&inode->v);
-
- if (ret < 0)
- ret = bch2_err_class(ret);
-
- if (!sync) {
- req->ki_complete(req, ret);
- ret = -EIOCBQUEUED;
}
- return ret;
+ goto out;
}
static void bch2_dio_write_loop_async(struct bch_write_op *op)
{
struct dio_write *dio = container_of(op, struct dio_write, op);
+ struct mm_struct *mm = dio->mm;
- if (dio->sync)
- complete(&dio->done);
- else
- bch2_dio_write_loop(dio);
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error) {
+ bch2_dio_write_done(dio);
+ return;
+ }
+
+ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+
+ if (mm)
+ kthread_use_mm(mm);
+ bch2_dio_write_loop(dio);
+ if (mm)
+ kthread_unuse_mm(mm);
}
static noinline
@@ -2268,7 +2347,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
goto err;
inode_dio_begin(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_get(inode);
extending = req->ki_pos + iter->count > inode->v.i_size;
if (!extending) {
@@ -2282,26 +2361,31 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
GFP_KERNEL,
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
- init_completion(&dio->done);
dio->req = req;
+ dio->mapping = mapping;
+ dio->inode = inode;
dio->mm = current->mm;
dio->loop = false;
dio->sync = is_sync_kiocb(req) || extending;
+ dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->written = 0;
dio->iter = *iter;
+ dio->op.c = c;
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
iter->count >> 9, true);
if (unlikely(ret))
goto err_put_bio;
- ret = write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter->count - 1);
- if (unlikely(ret))
- goto err_put_bio;
+ if (unlikely(mapping->nrpages)) {
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter->count - 1);
+ if (unlikely(ret))
+ goto err_put_bio;
+ }
ret = bch2_dio_write_loop(dio);
err:
@@ -2309,7 +2393,7 @@ err:
inode_unlock(&inode->v);
return ret;
err_put_bio:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_put(inode);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
bio_put(bio);
inode_dio_end(&inode->v);
@@ -2613,7 +2697,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
}
inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_get(inode);
ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
if (ret)
@@ -2692,7 +2776,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
ret = bch2_setattr_nonsize(mnt_userns, inode, iattr);
err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_put(inode);
return bch2_err_class(ret);
}
@@ -3005,8 +3089,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
}
ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
- &reservation.k_i,
- &disk_res, NULL,
+ &reservation.k_i, &disk_res,
0, &i_sectors_delta, true);
if (ret)
goto bkey_err;
@@ -3105,7 +3188,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
inode_lock(&inode->v);
inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_get(inode);
ret = file_modified(file);
if (ret)
@@ -3122,7 +3205,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
else
ret = -EOPNOTSUPP;
err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+ bch2_pagecache_block_put(inode);
inode_unlock(&inode->v);
percpu_ref_put(&c->writes);
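
The dio rework above stops carrying a flush flag through the write path: a DSYNC write now records which journal sequence its inode update landed in (bi_journal_seq, maintained by the trigger code), and bch2_dio_write_flush() waits on just that sequence. A rough userspace sketch of that flush-to-sequence contract, with hypothetical names and a pthread condvar standing in for the journal's closure machinery:

#include <assert.h>
#include <pthread.h>
#include <stdint.h>

struct journal {
	pthread_mutex_t	lock;
	pthread_cond_t	flushed;
	uint64_t	persisted_seq;	/* highest sequence known durable */
};

/* Block until every journal entry up to @seq is persistent: */
static void journal_flush_seq(struct journal *j, uint64_t seq)
{
	pthread_mutex_lock(&j->lock);
	while (j->persisted_seq < seq)
		pthread_cond_wait(&j->flushed, &j->lock);
	pthread_mutex_unlock(&j->lock);
}

/* Called as journal writes complete: */
static void journal_write_done(struct journal *j, uint64_t seq)
{
	pthread_mutex_lock(&j->lock);
	if (seq > j->persisted_seq) {
		j->persisted_seq = seq;
		pthread_cond_broadcast(&j->flushed);
	}
	pthread_mutex_unlock(&j->lock);
}

int main(void)
{
	struct journal j = {
		.lock		= PTHREAD_MUTEX_INITIALIZER,
		.flushed	= PTHREAD_COND_INITIALIZER,
	};

	journal_write_done(&j, 42);	/* inode update landed in seq 42 */
	journal_flush_seq(&j, 42);	/* fsync path: returns immediately */
	return 0;
}

The upshot is that a DSYNC write only pays for flushing its own inode's sequence, not a full journal flush on every write.
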
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 186faa54..4591b75f 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -43,58 +43,6 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_subvolume *);
-static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
-{
- BUG_ON(atomic_long_read(&lock->v) == 0);
-
- if (atomic_long_sub_return_release(i, &lock->v) == 0)
- wake_up_all(&lock->wait);
-}
-
-static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
-{
- long v = atomic_long_read(&lock->v), old;
-
- do {
- old = v;
-
- if (i > 0 ? v < 0 : v > 0)
- return false;
- } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
- old, old + i)) != old);
- return true;
-}
-
-static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
-{
- wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
-}
-
-void bch2_pagecache_add_put(struct pagecache_lock *lock)
-{
- __pagecache_lock_put(lock, 1);
-}
-
-bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
-{
- return __pagecache_lock_tryget(lock, 1);
-}
-
-void bch2_pagecache_add_get(struct pagecache_lock *lock)
-{
- __pagecache_lock_get(lock, 1);
-}
-
-void bch2_pagecache_block_put(struct pagecache_lock *lock)
-{
- __pagecache_lock_put(lock, -1);
-}
-
-void bch2_pagecache_block_get(struct pagecache_lock *lock)
-{
- __pagecache_lock_get(lock, -1);
-}
-
void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -1409,7 +1357,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
- pagecache_lock_init(&inode->ei_pagecache_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
mutex_init(&inode->ei_quota_lock);
return &inode->v;
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 9f4b57e3..b11a1508 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -6,31 +6,11 @@
#include "opts.h"
#include "str_hash.h"
#include "quota_types.h"
+#include "two_state_shared_lock.h"
#include <linux/seqlock.h>
#include <linux/stat.h>
-/*
- * Two-state lock - can be taken for add or block - both states are shared,
- * like read side of rwsem, but conflict with other state:
- */
-struct pagecache_lock {
- atomic_long_t v;
- wait_queue_head_t wait;
-};
-
-static inline void pagecache_lock_init(struct pagecache_lock *lock)
-{
- atomic_long_set(&lock->v, 0);
- init_waitqueue_head(&lock->wait);
-}
-
-void bch2_pagecache_add_put(struct pagecache_lock *);
-bool bch2_pagecache_add_tryget(struct pagecache_lock *);
-void bch2_pagecache_add_get(struct pagecache_lock *);
-void bch2_pagecache_block_put(struct pagecache_lock *);
-void bch2_pagecache_block_get(struct pagecache_lock *);
-
struct bch_inode_info {
struct inode v;
unsigned long ei_flags;
@@ -39,7 +19,7 @@ struct bch_inode_info {
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
- struct pagecache_lock ei_pagecache_lock;
+ two_state_lock_t ei_pagecache_lock;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
@@ -50,6 +30,13 @@ struct bch_inode_info {
struct bch_inode_unpacked ei_inode;
};
+#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
+
+#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
+#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
+
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
return (subvol_inum) {
@@ -96,7 +83,7 @@ do { \
if ((_locks) & INODE_LOCK) \
down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
+ bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
mutex_lock_nested(&a[i]->ei_update_lock, i);\
} \
@@ -114,7 +101,7 @@ do { \
if ((_locks) & INODE_LOCK) \
up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
- bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
+ bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
mutex_unlock(&a[i]->ei_update_lock); \
} \
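
The macros above map the old add/block API onto states 0 and 1 of the shared lock: add-side holders drive the counter negative, block-side holders drive it positive, so each side nests with itself while excluding the other. A standalone check of that sign convention (a hypothetical test, not part of the patch):

#include <assert.h>

/* i = -1 for the add state (0), +1 for the block state (1): */
static int conflicts(long v, long i)
{
	return i > 0 ? v < 0 : v > 0;
}

int main(void)
{
	long v = 0;

	assert(!conflicts(v, -1)); v += -1;	/* pagecache_add_get */
	assert(!conflicts(v, -1)); v += -1;	/* add side nests freely */
	assert(conflicts(v, +1));		/* block side must wait */
	v -= -1; v -= -1;			/* both adds released */
	assert(!conflicts(v, +1)); v += +1;	/* pagecache_block_get */
	assert(conflicts(v, -1));		/* buffered IO now waits */
	return 0;
}
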
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 60a14fa1..82caaf51 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -16,6 +16,7 @@
#include "checksum.h"
#include "compress.h"
#include "clock.h"
+#include "data_update.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@@ -237,12 +238,14 @@ int bch2_extent_update(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *k,
struct disk_reservation *disk_res,
- u64 *journal_seq,
u64 new_i_size,
s64 *i_sectors_delta_total,
bool check_enospc)
{
struct btree_iter inode_iter = { NULL };
+ struct bkey_s_c inode_k;
+ struct bkey_s_c_inode_v3 inode;
+ struct bkey_i_inode_v3 *new_inode;
struct bpos next_pos;
bool usage_increasing;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
@@ -282,59 +285,51 @@ int bch2_extent_update(struct btree_trans *trans,
return ret;
}
- if (new_i_size || i_sectors_delta) {
- struct bkey_s_c k;
- struct bkey_s_c_inode_v3 inode;
- struct bkey_i_inode_v3 *new_inode;
- bool i_size_update;
+ bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, iter->snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ inode_k = bch2_btree_iter_peek_slot(&inode_iter);
+ ret = bkey_err(inode_k);
+ if (unlikely(ret))
+ goto err;
- bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes,
- SPOS(0, inum.inum, iter->snapshot),
- BTREE_ITER_INTENT|BTREE_ITER_CACHED);
- k = bch2_btree_iter_peek_slot(&inode_iter);
- ret = bkey_err(k);
- if (unlikely(ret))
- goto err;
+ ret = bkey_is_inode(inode_k.k) ? 0 : -ENOENT;
+ if (unlikely(ret))
+ goto err;
- ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
+ if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) {
+ inode_k = bch2_inode_to_v3(trans, inode_k);
+ ret = bkey_err(inode_k);
if (unlikely(ret))
goto err;
+ }
- if (unlikely(k.k->type != KEY_TYPE_inode_v3)) {
- k = bch2_inode_to_v3(trans, k);
- ret = bkey_err(k);
- if (unlikely(ret))
- goto err;
- }
-
- inode = bkey_s_c_to_inode_v3(k);
- i_size_update = !(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > le64_to_cpu(inode.v->bi_size);
-
- if (!i_sectors_delta && !i_size_update)
- goto no_inode_update;
+ inode = bkey_s_c_to_inode_v3(inode_k);
- new_inode = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- ret = PTR_ERR_OR_ZERO(new_inode);
- if (unlikely(ret))
- goto err;
+ new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k));
+ ret = PTR_ERR_OR_ZERO(new_inode);
+ if (unlikely(ret))
+ goto err;
- bkey_reassemble(&new_inode->k_i, k);
+ bkey_reassemble(&new_inode->k_i, inode.s_c);
- if (i_size_update)
- new_inode->v.bi_size = cpu_to_le64(new_i_size);
+ if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+ new_i_size > le64_to_cpu(inode.v->bi_size))
+ new_inode->v.bi_size = cpu_to_le64(new_i_size);
- le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
+ le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
- new_inode->k.p.snapshot = iter->snapshot;
+ new_inode->k.p.snapshot = iter->snapshot;
- ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0);
- if (unlikely(ret))
- goto err;
- }
-no_inode_update:
- ret = bch2_trans_update(trans, iter, k, 0) ?:
- bch2_trans_commit(trans, disk_res, journal_seq,
+ /*
+ * Note:
+	 * We always have to do an inode update - even when i_size/i_sectors
+ * aren't changing - for fsync to work properly; fsync relies on
+ * inode->bi_journal_seq which is updated by the trigger code:
+ */
+ ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0) ?:
+ bch2_trans_update(trans, iter, k, 0) ?:
+ bch2_trans_commit(trans, disk_res, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL);
if (unlikely(ret))
@@ -397,8 +392,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
bch2_cut_back(end_pos, &delete);
ret = bch2_extent_update(trans, inum, iter, &delete,
- &disk_res, NULL,
- 0, i_sectors_delta, false);
+ &disk_res, 0, i_sectors_delta, false);
bch2_disk_reservation_put(c, &disk_res);
}
@@ -428,7 +422,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
return ret;
}
-int bch2_write_index_default(struct bch_write_op *op)
+static int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct bkey_buf sk;
@@ -465,7 +459,7 @@ int bch2_write_index_default(struct bch_write_op *op)
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_extent_update(&trans, inum, &iter, sk.k,
- &op->res, op_journal_seq(op),
+ &op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(&trans, &iter);
@@ -543,29 +537,22 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
}
}
-static void __bch2_write(struct closure *);
+static void __bch2_write(struct bch_write_op *);
static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
- if (!op->error && (op->flags & BCH_WRITE_FLUSH))
- op->error = bch2_journal_error(&c->journal);
-
bch2_disk_reservation_put(c, &op->res);
percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- if (op->end_io) {
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
+ closure_debug_destroy(cl);
+ if (op->end_io)
op->end_io(op);
- } else {
- closure_return(cl);
- }
}
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
@@ -603,7 +590,7 @@ static void __bch2_write_index(struct bch_write_op *op)
struct keylist *keys = &op->insert_keys;
struct bkey_i *k;
unsigned dev;
- int ret;
+ int ret = 0;
if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
ret = bch2_write_drop_io_error_ptrs(op);
@@ -626,7 +613,10 @@ static void __bch2_write_index(struct bch_write_op *op)
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
- int ret = op->index_update_fn(op);
+
+ ret = !(op->flags & BCH_WRITE_MOVE)
+ ? bch2_write_index_default(op)
+ : bch2_data_update_index_update(op);
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
BUG_ON(keylist_sectors(keys) && !ret);
@@ -636,7 +626,7 @@ static void __bch2_write_index(struct bch_write_op *op)
if (ret) {
bch_err_inum_ratelimited(c, op->pos.inode,
"write error while doing btree update: %s", bch2_err_str(ret));
- op->error = ret;
+ goto err;
}
}
out:
@@ -649,25 +639,45 @@ out:
err:
keys->top = keys->keys;
op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
goto out;
}
static void bch2_write_index(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
+ struct write_point *wp = op->wp;
+ struct workqueue_struct *wq = index_update_wq(op);
- __bch2_write_index(op);
+ barrier();
+ op->btree_update_ready = true;
+ queue_work(wq, &wp->index_update_work);
+}
- if (!(op->flags & BCH_WRITE_DONE)) {
- continue_at(cl, __bch2_write, index_update_wq(op));
- } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
- bch2_journal_flush_seq_async(&c->journal,
- *op_journal_seq(op),
- cl);
- continue_at(cl, bch2_write_done, index_update_wq(op));
- } else {
- continue_at_nobarrier(cl, bch2_write_done, NULL);
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+ struct write_point *wp =
+ container_of(work, struct write_point, index_update_work);
+ struct bch_write_op *op;
+
+ while (1) {
+ spin_lock(&wp->writes_lock);
+ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+ if (op && !op->btree_update_ready)
+ op = NULL;
+ if (op)
+ list_del(&op->wp_list);
+ spin_unlock(&wp->writes_lock);
+
+ if (!op)
+ break;
+
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ __bch2_write(op);
+ else
+ bch2_write_done(&op->cl);
}
}
@@ -700,12 +710,12 @@ static void bch2_write_endio(struct bio *bio)
if (wbio->put_bio)
bio_put(bio);
- if (parent)
+ if (parent) {
bio_endio(&parent->bio);
- else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
- closure_put(cl);
- else
- continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
+ return;
+ }
+
+ closure_put(cl);
}
static void init_append_extent(struct bch_write_op *op,
@@ -1112,19 +1122,18 @@ err:
return ret;
}
-static void __bch2_write(struct closure *cl)
+static void __bch2_write(struct bch_write_op *op)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
- struct write_point *wp;
+ struct write_point *wp = NULL;
struct bio *bio = NULL;
- bool skip_put = true;
unsigned nofs_flags;
int ret;
nofs_flags = memalloc_nofs_save();
again:
memset(&op->failed, 0, sizeof(op->failed));
+ op->btree_update_ready = false;
do {
struct bkey_i *key_to_write;
@@ -1134,76 +1143,60 @@ again:
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets.v))
- goto flush_io;
+ break;
if (bch2_keylist_realloc(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys),
BKEY_EXTENT_U64s_MAX))
- goto flush_io;
+ break;
/*
* The copygc thread is now global, which means it's no longer
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
- wp = bch2_alloc_sectors_start(c,
- op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
- op->write_point,
- &op->devs_have,
- op->nr_replicas,
- op->nr_replicas_required,
- op->alloc_reserve,
- op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
- EBUG_ON(!wp);
-
- if (IS_ERR(wp)) {
- if (unlikely(wp != ERR_PTR(-EAGAIN))) {
- ret = PTR_ERR(wp);
- goto err;
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_alloc_sectors_start_trans(&trans,
+ op->target,
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->write_point,
+ &op->devs_have,
+ op->nr_replicas,
+ op->nr_replicas_required,
+ op->alloc_reserve,
+ op->flags,
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS))
+ ? NULL : &op->cl, &wp));
+ if (unlikely(ret)) {
+ if (unlikely(ret != -EAGAIN)) {
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
}
- goto flush_io;
+ break;
}
- /*
- * It's possible for the allocator to fail, put us on the
- * freelist waitlist, and then succeed in one of various retry
- * paths: if that happens, we need to disable the skip_put
- * optimization because otherwise there won't necessarily be a
- * barrier before we free the bch_write_op:
- */
- if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
- skip_put = false;
-
bch2_open_bucket_get(c, wp, &op->open_buckets);
ret = bch2_write_extent(op, wp, &bio);
- bch2_alloc_sectors_done(c, wp);
- if (ret < 0)
- goto err;
+ bch2_alloc_sectors_done(c, wp);
- if (ret) {
- skip_put = false;
- } else {
- /*
- * for the skip_put optimization this has to be set
- * before we submit the bio:
- */
+ if (ret < 0) {
+ op->error = ret;
op->flags |= BCH_WRITE_DONE;
+ break;
}
+ if (!ret)
+ op->flags |= BCH_WRITE_DONE;
+
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
- if (!skip_put)
- closure_get(bio->bi_private);
- else
- op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+ closure_get(bio->bi_private);
key_to_write = (void *) (op->insert_keys.keys_p +
key_to_write_offset);
@@ -1212,48 +1205,34 @@ again:
key_to_write);
} while (ret);
- if (!skip_put)
- continue_at(cl, bch2_write_index, index_update_wq(op));
-out:
- memalloc_nofs_restore(nofs_flags);
- return;
-err:
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
-
- continue_at(cl, bch2_write_index, index_update_wq(op));
- goto out;
-flush_io:
/*
- * If the write can't all be submitted at once, we generally want to
- * block synchronously as that signals backpressure to the caller.
+ * Sync or no?
*
- * However, if we're running out of a workqueue, we can't block here
- * because we'll be blocking other work items from completing:
+	 * If we're running asynchronously, we may still want to block
+ * synchronously here if we weren't able to submit all of the IO at
+ * once, as that signals backpressure to the caller.
*/
- if (current->flags & PF_WQ_WORKER) {
- continue_at(cl, bch2_write_index, index_update_wq(op));
- goto out;
- }
-
- closure_sync(cl);
-
- if (!bch2_keylist_empty(&op->insert_keys)) {
+ if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) {
+ closure_sync(&op->cl);
__bch2_write_index(op);
- if (op->error) {
- op->flags |= BCH_WRITE_DONE;
- continue_at_nobarrier(cl, bch2_write_done, NULL);
- goto out;
- }
+ if (!(op->flags & BCH_WRITE_DONE))
+ goto again;
+ bch2_write_done(&op->cl);
+ } else {
+ spin_lock(&wp->writes_lock);
+ op->wp = wp;
+ list_add_tail(&op->wp_list, &wp->writes);
+ spin_unlock(&wp->writes_lock);
+
+ continue_at(&op->cl, bch2_write_index, NULL);
}
- goto again;
+ memalloc_nofs_restore(nofs_flags);
}
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{
- struct closure *cl = &op->cl;
struct bio *bio = &op->wbio.bio;
struct bvec_iter iter;
struct bkey_i_inline_data *id;
@@ -1290,8 +1269,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
op->flags |= BCH_WRITE_DONE;
- continue_at_nobarrier(cl, bch2_write_index, NULL);
- return;
+ __bch2_write_index(op);
err:
bch2_write_done(&op->cl);
}
@@ -1319,6 +1297,7 @@ void bch2_write(struct closure *cl)
struct bch_fs *c = op->c;
unsigned data_len;
+ EBUG_ON(op->cl.parent);
BUG_ON(!op->nr_replicas);
BUG_ON(!op->write_point.v);
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
@@ -1352,24 +1331,19 @@ void bch2_write(struct closure *cl)
return;
}
- continue_at_nobarrier(cl, __bch2_write, NULL);
+ __bch2_write(op);
return;
err:
bch2_disk_reservation_put(c, &op->res);
- if (op->end_io) {
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
+ closure_debug_destroy(&op->cl);
+ if (op->end_io)
op->end_io(op);
- } else {
- closure_return(cl);
- }
}
/* Cache promotion on read */
struct promote_op {
- struct closure cl;
struct rcu_head rcu;
u64 start_time;
@@ -1423,10 +1397,10 @@ static void promote_free(struct bch_fs *c, struct promote_op *op)
kfree_rcu(op, rcu);
}
-static void promote_done(struct closure *cl)
+static void promote_done(struct bch_write_op *wop)
{
struct promote_op *op =
- container_of(cl, struct promote_op, cl);
+ container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.op.c;
bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
@@ -1438,7 +1412,6 @@ static void promote_done(struct closure *cl)
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
- struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
trace_and_count(op->write.op.c, read_promote, &rbio->bio);
@@ -1451,9 +1424,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
- closure_init(cl, NULL);
- bch2_data_update_read_done(&op->write, rbio->pick.crc, cl);
- closure_return_with_destructor(cl, promote_done);
+ bch2_data_update_read_done(&op->write, rbio->pick.crc);
}
static struct promote_op *__promote_alloc(struct bch_fs *c,
@@ -1518,6 +1489,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
},
btree_id, k);
BUG_ON(ret);
+ op->write.op.end_io = promote_done;
return op;
err:
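
bch2_write_point_do_index_updates() above keeps btree index updates in submission order even though IO completes out of order: ops join the write point's list at submission, completions mark them btree_update_ready, and the worker only ever pops a ready list head. A userspace sketch of that in-order drain (hypothetical names, a pthread mutex in place of the spinlock):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct op {
	struct op	*next;
	bool		ready;	/* set when the data IO completes */
};

struct write_point {
	pthread_mutex_t	lock;
	struct op	*head;
};

/* Pop the head only once its IO has completed; NULL otherwise: */
static struct op *pop_ready(struct write_point *wp)
{
	struct op *op = NULL;

	pthread_mutex_lock(&wp->lock);
	if (wp->head && wp->head->ready) {
		op = wp->head;
		wp->head = op->next;
	}
	pthread_mutex_unlock(&wp->lock);
	return op;
}

int main(void)
{
	struct op b = { .next = NULL }, a = { .next = &b };
	struct write_point wp = {
		.lock	= PTHREAD_MUTEX_INITIALIZER,
		.head	= &a,
	};

	b.ready = true;			/* second write finishes first... */
	assert(!pop_ready(&wp));	/* ...but can't jump the queue */
	a.ready = true;
	assert(pop_ready(&wp) == &a);	/* index updates stay in order */
	assert(pop_ready(&wp) == &b);
	return 0;
}
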
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 3ae31758..e23ff0ed 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -27,28 +27,20 @@ const char *bch2_blk_status_to_str(blk_status_t);
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
- BCH_WRITE_FLUSH = (1 << 2),
- BCH_WRITE_DATA_ENCODED = (1 << 3),
- BCH_WRITE_PAGES_STABLE = (1 << 4),
- BCH_WRITE_PAGES_OWNED = (1 << 5),
- BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
- BCH_WRITE_WROTE_DATA_INLINE = (1 << 7),
- BCH_WRITE_FROM_INTERNAL = (1 << 8),
- BCH_WRITE_CHECK_ENOSPC = (1 << 9),
+ BCH_WRITE_DATA_ENCODED = (1 << 2),
+ BCH_WRITE_PAGES_STABLE = (1 << 3),
+ BCH_WRITE_PAGES_OWNED = (1 << 4),
+ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5),
+ BCH_WRITE_WROTE_DATA_INLINE = (1 << 6),
+ BCH_WRITE_CHECK_ENOSPC = (1 << 7),
+ BCH_WRITE_SYNC = (1 << 8),
+ BCH_WRITE_MOVE = (1 << 9),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10),
- BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
- BCH_WRITE_DONE = (1 << 12),
- BCH_WRITE_IO_ERROR = (1 << 13),
+ BCH_WRITE_DONE = (1 << 10),
+ BCH_WRITE_IO_ERROR = (1 << 11),
};
-static inline u64 *op_journal_seq(struct bch_write_op *op)
-{
- return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
- ? op->journal_seq_p : &op->journal_seq;
-}
-
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_movinggc
@@ -60,14 +52,12 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
struct bkey_i *, bool *, s64 *, s64 *);
int bch2_extent_update(struct btree_trans *, subvol_inum,
struct btree_iter *, struct bkey_i *,
- struct disk_reservation *, u64 *, u64, s64 *, bool);
+ struct disk_reservation *, u64, s64 *, bool);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-int bch2_write_index_default(struct bch_write_op *);
-
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct bch_io_opts opts)
{
@@ -91,14 +81,14 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
- op->journal_seq = 0;
op->new_i_size = U64_MAX;
op->i_sectors_delta = 0;
- op->index_update_fn = bch2_write_index_default;
}
void bch2_write(struct closure *);
+void bch2_write_point_do_index_updates(struct work_struct *);
+
static inline struct bch_write_bio *wbio_init(struct bio *bio)
{
struct bch_write_bio *wbio = to_wbio(bio);
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index 78bff13d..a91635d1 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -117,6 +117,7 @@ struct bch_write_op {
unsigned nr_replicas_required:4;
unsigned alloc_reserve:3;
unsigned incompressible:1;
+ unsigned btree_update_ready:1;
struct bch_devs_list devs_have;
u16 target;
@@ -132,23 +133,16 @@ struct bch_write_op {
struct write_point_specifier write_point;
+ struct write_point *wp;
+ struct list_head wp_list;
+
struct disk_reservation res;
struct open_buckets open_buckets;
- /*
- * If caller wants to flush but hasn't passed us a journal_seq ptr, we
- * still need to stash the journal_seq somewhere:
- */
- union {
- u64 *journal_seq_p;
- u64 journal_seq;
- };
u64 new_i_size;
s64 i_sectors_delta;
- int (*index_update_fn)(struct bch_write_op *);
-
struct bch_devs_mask failed;
struct keylist insert_keys;
diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h
index 195799bb..635efb7e 100644
--- a/libbcachefs/keylist.h
+++ b/libbcachefs/keylist.h
@@ -17,7 +17,6 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
- bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 74869204..1d11cf0d 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -53,9 +53,8 @@ struct moving_io {
struct bio_vec bi_inline_vecs[0];
};
-static void move_free(struct closure *cl)
+static void move_free(struct moving_io *io)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
struct bch_fs *c = ctxt->c;
@@ -65,31 +64,30 @@ static void move_free(struct closure *cl)
kfree(io);
}
-static void move_write_done(struct closure *cl)
+static void move_write_done(struct bch_write_op *op)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
+ struct moving_io *io = container_of(op, struct moving_io, write.op);
struct moving_context *ctxt = io->write.ctxt;
if (io->write.op.error)
ctxt->write_error = true;
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
- closure_return_with_destructor(cl, move_free);
+ move_free(io);
+ closure_put(&ctxt->cl);
}
-static void move_write(struct closure *cl)
+static void move_write(struct moving_io *io)
{
- struct moving_io *io = container_of(cl, struct moving_io, cl);
-
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
- closure_return_with_destructor(cl, move_free);
+ move_free(io);
return;
}
+ closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
- bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl);
- continue_at(cl, move_write_done, NULL);
+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@@ -121,7 +119,7 @@ static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *t
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
- closure_call(&io->cl, move_write, NULL, &ctxt->cl);
+ move_write(io);
}
}
@@ -185,7 +183,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
}
}
-void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
scnprintf(stats->name, sizeof(stats->name), "%s", name);
@@ -302,6 +300,7 @@ static int bch2_move_extent(struct btree_trans *trans,
goto err_free_pages;
io->write.ctxt = ctxt;
+ io->write.op.end_io = move_write_done;
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@@ -956,7 +955,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
- bch_move_stats_init(stats, "rereplicate");
+ bch2_move_stats_init(stats, "rereplicate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
@@ -980,7 +979,7 @@ int bch2_data_job(struct bch_fs *c,
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
- bch_move_stats_init(stats, "migrate");
+ bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
@@ -1001,7 +1000,7 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
- bch_move_stats_init(stats, "rewrite_old_nodes");
+ bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
break;
default:
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index c0fec69b..b14f679f 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -60,8 +60,7 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
-inline void bch_move_stats_init(struct bch_move_stats *stats,
- char *name);
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
#endif /* _BCACHEFS_MOVE_H */
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 044eca87..63bc692f 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -102,7 +102,7 @@ static int bch2_copygc(struct bch_fs *c)
};
int ret = 0;
- bch_move_stats_init(&move_stats, "copygc");
+ bch2_move_stats_init(&move_stats, "copygc");
for_each_rw_member(ca, c, dev_idx)
heap_size += ca->mi.nbuckets >> 7;
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 17b289b0..4df981bd 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -189,7 +189,7 @@ static int bch2_rebalance_thread(void *arg)
prev_start = jiffies;
prev_cputime = curr_cputime();
- bch_move_stats_init(&move_stats, "rebalance");
+ bch2_move_stats_init(&move_stats, "rebalance");
while (!kthread_wait_freezable(r->enabled)) {
cond_resched();
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 6968f934..fdcd70e8 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -1414,7 +1414,7 @@ use_clean:
le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
struct bch_move_stats stats;
- bch_move_stats_init(&stats, "recovery");
+ bch2_move_stats_init(&stats, "recovery");
bch_info(c, "scanning for old btree nodes");
ret = bch2_fs_read_write(c);
@@ -1486,6 +1486,9 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
set_bit(BCH_FS_FSCK_DONE, &c->flags);
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index d5c14bb2..0d4c004d 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -378,7 +378,7 @@ s64 bch2_remap_range(struct bch_fs *c,
dst_end.offset - dst_iter.pos.offset));
ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res, NULL,
+ new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 06b2924c..647d018b 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -184,7 +184,7 @@ read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
-read_attribute(btree_avg_write_size);
+read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@@ -250,14 +250,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
-static size_t bch2_btree_avg_write_size(struct bch_fs *c)
-{
- u64 nr = atomic64_read(&c->btree_writes_nr);
- u64 sectors = atomic64_read(&c->btree_writes_sectors);
-
- return nr ? div64_u64(sectors, nr) : 0;
-}
-
static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
{
long ret = 0;
@@ -396,7 +388,9 @@ SHOW(bch2_fs)
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
- sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
+
+ if (attr == &sysfs_btree_write_stats)
+ bch2_btree_write_stats_to_text(out, c);
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
@@ -554,7 +548,7 @@ SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
&sysfs_minor,
&sysfs_btree_cache_size,
- &sysfs_btree_avg_write_size,
+ &sysfs_btree_write_stats,
&sysfs_promote_whole_extents,
diff --git a/libbcachefs/two_state_shared_lock.c b/libbcachefs/two_state_shared_lock.c
new file mode 100644
index 00000000..dc508d54
--- /dev/null
+++ b/libbcachefs/two_state_shared_lock.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "two_state_shared_lock.h"
+
+void bch2_two_state_unlock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+
+ BUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+ long v = atomic_long_read(&lock->v), old;
+
+ do {
+ old = v;
+
+ if (i > 0 ? v < 0 : v > 0)
+ return false;
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+ old, old + i)) != old);
+ return true;
+}
+
+void bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ wait_event(lock->wait, bch2_two_state_trylock(lock, s));
+}
diff --git a/libbcachefs/two_state_shared_lock.h b/libbcachefs/two_state_shared_lock.h
new file mode 100644
index 00000000..1b4f1089
--- /dev/null
+++ b/libbcachefs/two_state_shared_lock.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TWO_STATE_LOCK_H
+#define _BCACHEFS_TWO_STATE_LOCK_H
+
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like the read side of a rwsem, but conflicting with the other state:
+ */
+typedef struct {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+} two_state_lock_t;
+
+static inline void two_state_lock_init(two_state_lock_t *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+void bch2_two_state_unlock(two_state_lock_t *, int);
+bool bch2_two_state_trylock(two_state_lock_t *, int);
+void bch2_two_state_lock(two_state_lock_t *, int);
+
+#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
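
For experimentation outside the kernel tree, the same protocol can be reproduced with C11 atomics and a mutex/condvar pair standing in for the waitqueue. This is a sketch under that assumption, with hypothetical names, not the patch's code:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

typedef struct {
	atomic_long	v;	/* <0: state-0 holders, >0: state-1 holders */
	pthread_mutex_t	mtx;
	pthread_cond_t	wait;
} two_state_lock;

#define TWO_STATE_LOCK_INIT \
	{ 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

static bool two_state_trylock(two_state_lock *lock, int s)
{
	long i = s ? 1 : -1;
	long old = atomic_load(&lock->v);

	do {
		if (i > 0 ? old < 0 : old > 0)
			return false;	/* held in the conflicting state */
	} while (!atomic_compare_exchange_weak(&lock->v, &old, old + i));
	return true;
}

static void two_state_lock_take(two_state_lock *lock, int s)
{
	pthread_mutex_lock(&lock->mtx);
	while (!two_state_trylock(lock, s))
		pthread_cond_wait(&lock->wait, &lock->mtx);
	pthread_mutex_unlock(&lock->mtx);
}

static void two_state_unlock_put(two_state_lock *lock, int s)
{
	long i = s ? 1 : -1;

	/* the last holder of this state drops the counter to zero: */
	if (atomic_fetch_sub(&lock->v, i) == i) {
		pthread_mutex_lock(&lock->mtx);
		pthread_cond_broadcast(&lock->wait);
		pthread_mutex_unlock(&lock->mtx);
	}
}

Here wait_event()'s role is played by the mutex/condvar pair, and C11's default seq_cst ordering is stronger than the acquire/release pairing the kernel version gets away with.
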
diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c
index 643e3113..aa95db12 100644
--- a/linux/mean_and_variance.c
+++ b/linux/mean_and_variance.c
@@ -52,7 +52,7 @@
*
* note: this rounds towards 0.
*/
-inline s64 fast_divpow2(s64 n, u8 d)
+s64 fast_divpow2(s64 n, u8 d)
{
return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}
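
With fast_divpow2() now having a single out-of-line definition, its rounds-towards-0 contract is worth pinning down with a quick standalone check (an assumed test harness, not part of the patch): an arithmetic shift alone rounds -7 >> 1 down to -4, while the bias term makes it truncate to -3, matching C's integer division.

#include <assert.h>
#include <stdint.h>

static int64_t fast_divpow2(int64_t n, uint8_t d)
{
	/* add 2^d - 1 before shifting so negative values truncate: */
	return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}

int main(void)
{
	assert((-7 >> 1) == -4);		/* plain shift rounds down */
	assert(fast_divpow2(-7, 1) == -3);	/* matches -7 / 2 */
	assert(fast_divpow2(7, 1) == 3);
	assert(fast_divpow2(-8, 2) == -2);
	return 0;
}
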
diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c
index df9567c5..0ae56ee1 100644
--- a/linux/printbuf_userspace.c
+++ b/linux/printbuf_userspace.c
@@ -27,3 +27,8 @@ void prt_printf(struct printbuf *out, const char *fmt, ...)
prt_vprintf(out, fmt, args);
va_end(args);
}
+
+void prt_u64(struct printbuf *out, u64 v)
+{
+ prt_printf(out, "%llu", v);
+}
diff --git a/linux/six.c b/linux/six.c
index 39f7ea79..39a9bd6e 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -342,7 +342,11 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
return true;
}
-#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+/*
+ * We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's
+ * off for now:
+ */
+#ifdef SIX_LOCK_SPIN_ON_OWNER
static inline bool six_optimistic_spin(struct six_lock *lock,
struct six_lock_waiter *wait)
diff --git a/linux/wait.c b/linux/wait.c
index 991875c5..b1f002b9 100644
--- a/linux/wait.c
+++ b/linux/wait.c
@@ -66,6 +66,11 @@ void wake_up(wait_queue_head_t *q)
__wake_up(q, TASK_NORMAL, 1, NULL);
}
+void wake_up_all(wait_queue_head_t *q)
+{
+ __wake_up(q, TASK_NORMAL, 0, NULL);
+}
+
static void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
__wake_up_common(q, mode, nr, 0, NULL);