author    Kent Overstreet <kent.overstreet@gmail.com>  2021-04-04 22:12:56 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>  2021-04-04 22:15:29 -0400
commit    209695dedf49425ad9e91ba2b2239c3a040ea159 (patch)
tree      fac7a385d53f0fb72551622c5a95f353346a393b /libbcachefs
parent    f46437f06e8f3b67c81c5e1648a62279aed5f525 (diff)
Update bcachefs sources to f26267fc82 bcachefs: kill bset_tree->max_key
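
The headline change is the removal of the cached max_key from struct bset_tree: code that previously consulted t->max_key now reads the btree node's own bound, b->data->max_key, and the special-case bound check in __bch2_bset_search() is dropped. A minimal sketch of the shape of the change, assembled from the hunks below (fields abridged to those visible in the hunks, not standalone code):

	/* Before this commit, each bset_tree cached an upper bound: */
	struct bset_tree {
		u16		data_offset;
		u16		aux_data_offset;
		u16		end_offset;
		struct bpos	max_key;	/* dropped by this commit */
	};

	/* After: callers read the node-wide bound instead, e.g. in make_bfloat(): */
	if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
		k = (void *) max_key;
		bkey_init(&k->k);
		k->k.p = b->data->max_key;	/* was: t->max_key */
	}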
Diffstat (limited to 'libbcachefs')
-rw-r--r--  libbcachefs/bcachefs.h               |   6
-rw-r--r--  libbcachefs/bcachefs_format.h        |  14
-rw-r--r--  libbcachefs/bset.c                   |  36
-rw-r--r--  libbcachefs/btree_cache.c            | 134
-rw-r--r--  libbcachefs/btree_cache.h            |   5
-rw-r--r--  libbcachefs/btree_gc.c               |   7
-rw-r--r--  libbcachefs/btree_io.c               |   3
-rw-r--r--  libbcachefs/btree_iter.c             | 201
-rw-r--r--  libbcachefs/btree_iter.h             |  25
-rw-r--r--  libbcachefs/btree_key_cache.c        |  36
-rw-r--r--  libbcachefs/btree_key_cache.h        |  12
-rw-r--r--  libbcachefs/btree_locking.h          |  24
-rw-r--r--  libbcachefs/btree_types.h            |  13
-rw-r--r--  libbcachefs/btree_update_interior.c  | 415
-rw-r--r--  libbcachefs/btree_update_interior.h  |  28
-rw-r--r--  libbcachefs/btree_update_leaf.c      | 160
-rw-r--r--  libbcachefs/buckets.c                | 277
-rw-r--r--  libbcachefs/buckets.h                |  22
-rw-r--r--  libbcachefs/buckets_types.h          |  28
-rw-r--r--  libbcachefs/chardev.c                |   6
-rw-r--r--  libbcachefs/journal.c                |  57
-rw-r--r--  libbcachefs/journal.h                |  46
-rw-r--r--  libbcachefs/journal_reclaim.c        | 123
-rw-r--r--  libbcachefs/journal_types.h          |  10
-rw-r--r--  libbcachefs/migrate.c                |   9
-rw-r--r--  libbcachefs/move.c                   |   7
-rw-r--r--  libbcachefs/opts.c                   |   5
-rw-r--r--  libbcachefs/opts.h                   |   1
-rw-r--r--  libbcachefs/recovery.c               |  30
-rw-r--r--  libbcachefs/replicas.c               |  54
-rw-r--r--  libbcachefs/replicas.h               |  25
-rw-r--r--  libbcachefs/super-io.c               |   9
-rw-r--r--  libbcachefs/sysfs.c                  |  16
33 files changed, 858 insertions, 986 deletions
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 4133651d..549cded6 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -690,10 +690,11 @@ struct bch_fs {
struct bch_fs_usage *usage_base;
struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_fs_usage __percpu *usage_gc;
+ u64 __percpu *online_reserved;
/* single element mempool: */
struct mutex usage_scratch_lock;
- struct bch_fs_usage *usage_scratch;
+ struct bch_fs_usage_online *usage_scratch;
struct io_clock io_clock[2];
@@ -804,6 +805,9 @@ struct bch_fs {
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
+
+ atomic64_t btree_writes_nr;
+ atomic64_t btree_writes_sectors;
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index cb225951..ead7268b 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1398,11 +1398,17 @@ enum bch_sb_feature {
BCH_FEATURE_NR,
};
+#define BCH_SB_COMPAT() \
+ x(alloc_info, 0) \
+ x(alloc_metadata, 1) \
+ x(extents_above_btree_updates_done, 2) \
+ x(bformat_overflow_done, 3)
+
enum bch_sb_compat {
- BCH_COMPAT_FEAT_ALLOC_INFO = 0,
- BCH_COMPAT_FEAT_ALLOC_METADATA = 1,
- BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2,
- BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE = 3,
+#define x(f, n) BCH_COMPAT_##f,
+ BCH_SB_COMPAT()
+#undef x
+ BCH_COMPAT_NR,
};
/* options: */
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 3fb9a9ed..f92a757f 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -698,7 +698,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
k = (void *) max_key;
bkey_init(&k->k);
- k->k.p = t->max_key;
+ k->k.p = b->data->max_key;
}
}
@@ -782,8 +782,6 @@ retry:
while (k != btree_bkey_last(b, t))
prev = k, k = bkey_next(k);
- t->max_key = bkey_unpack_pos(b, prev);
-
if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
bkey_init(&min_key.k);
min_key.k.p = b->data->min_key;
@@ -791,7 +789,7 @@ retry:
if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
bkey_init(&max_key.k);
- max_key.k.p = t->max_key;
+ max_key.k.p = b->data->max_key;
}
/* Then we build the tree */
@@ -970,8 +968,6 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
min_key.u64s = max_key.u64s = 0;
if (bkey_next(k) == btree_bkey_last(b, t)) {
- t->max_key = bkey_unpack_pos(b, k);
-
for (j = 1; j < t->size; j = j * 2 + 1)
make_bfloat(b, t, j, &min_key, &max_key);
}
@@ -1311,16 +1307,6 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
case BSET_RW_AUX_TREE:
return bset_search_write_set(b, t, search);
case BSET_RO_AUX_TREE:
- /*
- * Each node in the auxiliary search tree covers a certain range
- * of bits, and keys above and below the set it covers might
- * differ outside those bits - so we have to special case the
- * start and end - handle that here:
- */
-
- if (bpos_cmp(*search, t->max_key) > 0)
- return btree_bkey_last(b, t);
-
return bset_search_tree(b, t, search, lossy_packed_search);
default:
unreachable();
@@ -1357,23 +1343,6 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
return m;
}
-/*
- * Returns the first key greater than or equal to @search
- */
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- struct bkey_packed *packed_search,
- const struct bkey_packed *lossy_packed_search)
-{
- struct bkey_packed *m = __bch2_bset_search(b, t, search,
- lossy_packed_search);
-
- return bch2_bset_search_linear(b, t, search,
- packed_search, lossy_packed_search, m);
-}
-
/* Btree node iterator */
static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
@@ -1469,6 +1438,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
unsigned i;
EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0);
bset_aux_tree_verify(b);
memset(iter, 0, sizeof(*iter));
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 8a4667ba..1abc50f1 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -906,136 +906,6 @@ out:
return b;
}
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
- struct btree_iter *iter,
- struct btree *b,
- enum btree_node_sibling sib)
-{
- struct btree_trans *trans = iter->trans;
- struct btree *parent;
- struct btree_node_iter node_iter;
- struct bkey_packed *k;
- struct bkey_buf tmp;
- struct btree *ret = NULL;
- unsigned level = b->c.level;
-
- bch2_bkey_buf_init(&tmp);
-
- parent = btree_iter_node(iter, level + 1);
- if (!parent)
- return NULL;
-
- /*
- * There's a corner case where a btree_iter might have a node locked
- * that is just outside its current pos - when
- * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
- *
- * But the lock ordering checks in __bch2_btree_node_lock() go off of
- * iter->pos, not the node's key: so if the iterator is marked as
- * needing to be traversed, we risk deadlock if we don't bail out here:
- */
- if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
- return ERR_PTR(-EINTR);
-
- if (!bch2_btree_node_relock(iter, level + 1)) {
- ret = ERR_PTR(-EINTR);
- goto out;
- }
-
- node_iter = iter->l[parent->c.level].iter;
-
- k = bch2_btree_node_iter_peek_all(&node_iter, parent);
- BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
-
- k = sib == btree_prev_sib
- ? bch2_btree_node_iter_prev(&node_iter, parent)
- : (bch2_btree_node_iter_advance(&node_iter, parent),
- bch2_btree_node_iter_peek(&node_iter, parent));
- if (!k)
- goto out;
-
- bch2_bkey_buf_unpack(&tmp, c, parent, k);
-
- ret = bch2_btree_node_get(c, iter, tmp.k, level,
- SIX_LOCK_intent, _THIS_IP_);
-
- if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
- struct btree_iter *linked;
-
- if (!bch2_btree_node_relock(iter, level + 1))
- goto out;
-
- /*
- * We might have got -EINTR because trylock failed, and we're
- * holding other locks that would cause us to deadlock:
- */
- trans_for_each_iter(trans, linked)
- if (btree_iter_lock_cmp(iter, linked) < 0)
- __bch2_btree_iter_unlock(linked);
-
- if (sib == btree_prev_sib)
- btree_node_unlock(iter, level);
-
- ret = bch2_btree_node_get(c, iter, tmp.k, level,
- SIX_LOCK_intent, _THIS_IP_);
-
- /*
- * before btree_iter_relock() calls btree_iter_verify_locks():
- */
- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level + 1);
-
- if (!bch2_btree_node_relock(iter, level)) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
-
- if (!IS_ERR(ret)) {
- six_unlock_intent(&ret->c.lock);
- ret = ERR_PTR(-EINTR);
- }
- }
-
- bch2_trans_relock(trans);
- }
-out:
- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level + 1);
-
- if (PTR_ERR_OR_ZERO(ret) == -EINTR)
- bch2_btree_iter_upgrade(iter, level + 2);
-
- BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));
-
- if (!IS_ERR_OR_NULL(ret)) {
- struct btree *n1 = ret, *n2 = b;
-
- if (sib != btree_prev_sib)
- swap(n1, n2);
-
- if (bpos_cmp(bpos_successor(n1->key.k.p),
- n2->data->min_key)) {
- char buf1[200], buf2[200];
-
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
- bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
-
- bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
- "prev: %s\n"
- "next: %s\n",
- bch2_btree_ids[iter->btree_id], level,
- buf1, buf2);
-
- six_unlock_intent(&ret->c.lock);
- ret = NULL;
- }
- }
-
- bch2_btree_trans_verify_locks(trans);
-
- bch2_bkey_buf_exit(&tmp, c);
-
- return ret;
-}
-
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k,
enum btree_id btree_id, unsigned level)
@@ -1075,7 +945,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
" format: u64s %u fields %u %u %u %u %u\n"
" unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
- " sib u64s: %u, %u (merge threshold %zu)\n"
+ " sib u64s: %u, %u (merge threshold %u)\n"
" nr packed keys %u\n"
" nr unpacked keys %u\n"
" floats %zu\n"
@@ -1092,7 +962,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
- BTREE_FOREGROUND_MERGE_THRESHOLD(c),
+ c->btree_foreground_merge_threshold,
b->nr.packed_keys,
b->nr.unpacked_keys,
stats.floats,
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 21798869..4791c3b6 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned, bool);
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
- struct btree *, enum btree_node_sibling);
-
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, enum btree_id, unsigned);
@@ -92,7 +89,7 @@ static inline unsigned btree_blocks(struct bch_fs *c)
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b)
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 88c549c4..268e0072 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -779,7 +779,7 @@ static int bch2_gc_done(struct bch_fs *c,
{
struct bch_dev *ca;
bool verify = (!initial ||
- (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
+ (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
unsigned i, dev;
int ret = 0;
@@ -1297,11 +1297,10 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
return;
}
- as = bch2_btree_update_start(iter->trans, iter->btree_id,
+ as = bch2_btree_update_start(iter, old_nodes[0]->c.level,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- NULL);
+ BTREE_INSERT_USE_RESERVE);
if (IS_ERR(as)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_RESERVE_GET);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 7fbacd9e..ec1290fa 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1547,6 +1547,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
b->written += sectors_to_write;
+ atomic64_inc(&c->btree_writes_nr);
+ atomic64_add(sectors_to_write, &c->btree_writes_sectors);
+
/* XXX: submitting IO with btree locks held: */
bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
bch2_bkey_buf_exit(&k, c);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 8190e73d..425c9ad7 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -12,6 +12,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "replicas.h"
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
@@ -238,6 +239,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
struct btree_iter *linked, *deadlock_iter = NULL;
u64 start_time = local_clock();
unsigned reason = 9;
+ bool ret;
/* Check if it's safe to block: */
trans_for_each_iter(trans, linked) {
@@ -258,17 +260,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- if (!(trans->nounlock)) {
- linked->locks_want = max_t(unsigned,
- linked->locks_want,
- __fls(linked->nodes_locked) + 1);
- if (!btree_iter_get_locks(linked, true, false)) {
- deadlock_iter = linked;
- reason = 1;
- }
- } else {
+ linked->locks_want = max_t(unsigned,
+ linked->locks_want,
+ __fls(linked->nodes_locked) + 1);
+ if (!btree_iter_get_locks(linked, true, false)) {
deadlock_iter = linked;
- reason = 2;
+ reason = 1;
}
}
@@ -298,18 +295,13 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
* we're about to lock, it must have the ancestors locked too:
*/
if (level > __fls(linked->nodes_locked)) {
- if (!(trans->nounlock)) {
- linked->locks_want =
- max(level + 1, max_t(unsigned,
- linked->locks_want,
- iter->locks_want));
- if (!btree_iter_get_locks(linked, true, false)) {
- deadlock_iter = linked;
- reason = 5;
- }
- } else {
+ linked->locks_want =
+ max(level + 1, max_t(unsigned,
+ linked->locks_want,
+ iter->locks_want));
+ if (!btree_iter_get_locks(linked, true, false)) {
deadlock_iter = linked;
- reason = 6;
+ reason = 5;
}
}
@@ -346,12 +338,23 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
if (six_trylock_type(&b->c.lock, type))
return true;
- if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
- return false;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans->locking_iter_idx = iter->idx;
+ trans->locking_pos = pos;
+ trans->locking_btree_id = iter->btree_id;
+ trans->locking_level = level;
+ trans->locking = b;
+#endif
- bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
- start_time);
- return true;
+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans->locking = NULL;
+#endif
+ if (ret)
+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
+ start_time);
+ return ret;
}
/* Btree iterator locking: */
@@ -421,50 +424,25 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
return false;
}
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
- unsigned new_locks_want)
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+ unsigned new_locks_want)
{
- unsigned l = iter->level;
+ unsigned l;
- EBUG_ON(iter->locks_want >= new_locks_want);
+ EBUG_ON(iter->locks_want < new_locks_want);
iter->locks_want = new_locks_want;
- do {
- if (!btree_iter_node(iter, l))
- break;
-
- if (!bch2_btree_node_upgrade(iter, l)) {
- iter->locks_want = l;
- return false;
- }
-
- l++;
- } while (l < iter->locks_want);
-
- return true;
-}
-
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
- unsigned downgrade_to)
-{
- unsigned l, new_locks_want = downgrade_to ?:
- (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
-
- if (iter->locks_want < downgrade_to) {
- iter->locks_want = new_locks_want;
-
- while (iter->nodes_locked &&
- (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
- if (l > iter->level) {
- btree_node_unlock(iter, l);
- } else {
- if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->l[l].b->c.lock);
- iter->nodes_intent_locked ^= 1 << l;
- }
- break;
+ while (iter->nodes_locked &&
+ (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
+ if (l > iter->level) {
+ btree_node_unlock(iter, l);
+ } else {
+ if (btree_node_intent_locked(iter, l)) {
+ six_lock_downgrade(&iter->l[l].b->c.lock);
+ iter->nodes_intent_locked ^= 1 << l;
}
+ break;
}
}
@@ -484,13 +462,12 @@ void bch2_trans_downgrade(struct btree_trans *trans)
bool bch2_trans_relock(struct btree_trans *trans)
{
struct btree_iter *iter;
- bool ret = true;
trans_for_each_iter(trans, iter)
- if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
- ret &= bch2_btree_iter_relock(iter, true);
-
- return ret;
+ if (btree_iter_keep(trans, iter) &&
+ !bch2_btree_iter_relock(iter, true))
+ return false;
+ return true;
}
void bch2_trans_unlock(struct btree_trans *trans)
@@ -1027,7 +1004,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
trans_for_each_iter(iter->trans, linked)
if (linked->l[level].b == b) {
- __btree_node_unlock(linked, level);
+ btree_node_unlock(linked, level);
linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
}
}
@@ -2008,6 +1985,8 @@ static inline void btree_iter_copy(struct btree_iter *dst,
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
unsigned flags)
{
struct btree_iter *iter, *best = NULL;
@@ -2020,10 +1999,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
pos.snapshot = btree_type_has_snapshots(btree_id)
? U32_MAX : 0;
- /* We always want a fresh iterator for node iterators: */
- if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES)
- goto alloc_iter;
-
trans_for_each_iter(trans, iter) {
if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
continue;
@@ -2038,7 +2013,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
best = iter;
}
-alloc_iter:
+
if (!best) {
iter = btree_trans_iter_alloc(trans);
bch2_btree_iter_init(trans, iter, btree_id);
@@ -2062,10 +2037,25 @@ alloc_iter:
iter->snapshot = pos.snapshot;
- if (!(iter->flags & BTREE_ITER_INTENT))
- bch2_btree_iter_downgrade(iter);
- else if (!iter->locks_want)
- __bch2_btree_iter_upgrade_nounlock(iter, 1);
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+
+ if (locks_want > iter->locks_want) {
+ iter->locks_want = locks_want;
+ btree_iter_get_locks(iter, true, false);
+ } else if (locks_want < iter->locks_want) {
+ __bch2_btree_iter_downgrade(iter, locks_want);
+ }
+
+ while (iter->level < depth) {
+ btree_node_unlock(iter, iter->level);
+ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+ iter->level++;
+ }
+
+ while (iter->level > depth)
+ iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT;
+
+ iter->min_depth = depth;
bch2_btree_iter_set_pos(iter, pos);
btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
@@ -2082,21 +2072,16 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
{
struct btree_iter *iter =
__bch2_trans_get_iter(trans, btree_id, pos,
- BTREE_ITER_NODES|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- flags);
- unsigned i;
+ locks_want, depth,
+ BTREE_ITER_NODES|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ flags);
BUG_ON(bkey_cmp(iter->pos, pos));
-
- iter->locks_want = locks_want;
- iter->level = depth;
- iter->min_depth = depth;
-
- for (i = 0; i < ARRAY_SIZE(iter->l); i++)
- iter->l[i].b = NULL;
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+ BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(iter->level != depth);
+ BUG_ON(iter->min_depth != depth);
iter->ip_allocated = _RET_IP_;
return iter;
@@ -2304,11 +2289,24 @@ bch2_btree_iter_node_to_text(struct printbuf *out,
struct btree_bkey_cached_common *_b,
enum btree_iter_type type)
{
- pr_buf(out, " %px l=%u %s:",
- _b, _b->level, bch2_btree_ids[_b->btree_id]);
+ pr_buf(out, " l=%u %s:",
+ _b->level, bch2_btree_ids[_b->btree_id]);
bch2_bpos_to_text(out, btree_node_pos(_b, type));
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+{
+ struct btree_iter *iter;
+
+ trans_for_each_iter(trans, iter)
+ if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
+ iter->nodes_locked)
+ return true;
+ return false;
+}
+#endif
+
void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
{
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -2319,14 +2317,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);
+ if (!trans_has_btree_nodes_locked(trans))
+ continue;
+
+ pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
trans_for_each_iter(trans, iter) {
if (!iter->nodes_locked)
continue;
- pr_buf(out, " iter %u %s:",
+ pr_buf(out, " iter %u %c %s:",
iter->idx,
+ btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
bch2_btree_ids[iter->btree_id]);
bch2_bpos_to_text(out, iter->pos);
pr_buf(out, "\n");
@@ -2345,17 +2347,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
b = READ_ONCE(trans->locking);
if (b) {
- pr_buf(out, " locking iter %u l=%u %s:",
+ iter = &trans->iters[trans->locking_iter_idx];
+ pr_buf(out, " locking iter %u %c l=%u %s:",
trans->locking_iter_idx,
+ btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
trans->locking_level,
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
-
pr_buf(out, " node ");
bch2_btree_iter_node_to_text(out,
(void *) b,
- btree_iter_type(&trans->iters[trans->locking_iter_idx]));
+ btree_iter_type(iter));
pr_buf(out, "\n");
}
}
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 7585f989..07d9b6d3 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -116,7 +116,6 @@ bool bch2_trans_relock(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
unsigned new_locks_want)
@@ -124,9 +123,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
return iter->locks_want < new_locks_want
- ? (!iter->trans->nounlock
- ? __bch2_btree_iter_upgrade(iter, new_locks_want)
- : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
+ ? __bch2_btree_iter_upgrade(iter, new_locks_want)
: iter->uptodate <= BTREE_ITER_NEED_PEEK;
}
@@ -134,8 +131,10 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
{
- if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
- __bch2_btree_iter_downgrade(iter, 0);
+ unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
+
+ if (iter->locks_want > new_locks_want)
+ __bch2_btree_iter_downgrade(iter, new_locks_want);
}
void bch2_trans_downgrade(struct btree_trans *);
@@ -175,8 +174,11 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
new_pos.snapshot = iter->snapshot;
- bkey_init(&iter->k);
- iter->k.p = iter->pos = new_pos;
+ iter->k.type = KEY_TYPE_deleted;
+ iter->k.p.inode = iter->pos.inode = new_pos.inode;
+ iter->k.p.offset = iter->pos.offset = new_pos.offset;
+ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
+ iter->k.size = 0;
}
/* Sort order for locking btree iterators: */
@@ -261,14 +263,17 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
void bch2_trans_unlink_iters(struct btree_trans *);
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
- struct bpos, unsigned);
+ struct bpos, unsigned,
+ unsigned, unsigned);
static inline struct btree_iter *
bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
struct bpos pos, unsigned flags)
{
struct btree_iter *iter =
- __bch2_trans_get_iter(trans, btree_id, pos, flags);
+ __bch2_trans_get_iter(trans, btree_id, pos,
+ (flags & BTREE_ITER_INTENT) != 0, 0,
+ flags);
iter->ip_allocated = _THIS_IP_;
return iter;
}
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 04354f56..0d3c0a40 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -352,6 +352,7 @@ err:
static int btree_key_cache_flush_pos(struct btree_trans *trans,
struct bkey_cached_key key,
u64 journal_seq,
+ unsigned commit_flags,
bool evict)
{
struct bch_fs *c = trans->c;
@@ -390,12 +391,17 @@ retry:
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- BTREE_INSERT_JOURNAL_RECLAIM);
+ (ck->journal.seq == journal_last_seq(j)
+ ? BTREE_INSERT_JOURNAL_RESERVED
+ : 0)|
+ commit_flags);
err:
if (ret == -EINTR)
goto retry;
+ if (ret == -EAGAIN)
+ goto out;
+
if (ret) {
bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
"error flushing key cache: %i", ret);
@@ -438,15 +444,15 @@ out:
return ret;
}
-static void btree_key_cache_journal_flush(struct journal *j,
- struct journal_entry_pin *pin,
- u64 seq)
+int bch2_btree_key_cache_journal_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
struct btree_trans trans;
+ int ret = 0;
int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
@@ -461,10 +467,13 @@ static void btree_key_cache_journal_flush(struct journal *j,
six_unlock_read(&ck->c.lock);
bch2_trans_init(&trans, c, 0, 0);
- btree_key_cache_flush_pos(&trans, key, seq, false);
+ ret = btree_key_cache_flush_pos(&trans, key, seq,
+ BTREE_INSERT_JOURNAL_RECLAIM, false);
bch2_trans_exit(&trans);
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+ return ret;
}
/*
@@ -480,7 +489,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
if (!bch2_btree_key_cache_find(c, id, pos))
return 0;
- return btree_key_cache_flush_pos(trans, key, 0, true);
+ return btree_key_cache_flush_pos(trans, key, 0, 0, true);
}
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
@@ -517,7 +526,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
- &ck->journal, btree_key_cache_journal_flush);
+ &ck->journal, bch2_btree_key_cache_journal_flush);
if (kick_reclaim)
journal_reclaim_kick(&c->journal);
@@ -581,9 +590,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
do {
struct rhash_head *pos, *next;
- rht_for_each_entry_safe(ck, pos, next, tbl, bc->shrink_iter, hash) {
+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+
+ while (!rht_is_a_nulls(pos)) {
+ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ ck = container_of(pos, struct bkey_cached, hash);
+
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
- continue;
+ goto next;
if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
@@ -595,6 +609,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
scanned++;
if (scanned >= nr)
break;
+next:
+ pos = next;
}
bc->shrink_iter++;
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index 4e1e5a9c..7e2b0a08 100644
--- a/libbcachefs/btree_key_cache.h
+++ b/libbcachefs/btree_key_cache.h
@@ -1,15 +1,6 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
-static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c)
-{
- size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
- size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
- size_t max_dirty = nr_keys / 4;
-
- return max_t(ssize_t, 0, nr_dirty - max_dirty);
-}
-
static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
{
size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
@@ -29,6 +20,9 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
}
+int bch2_btree_key_cache_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 38323e32..7532bcde 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -95,7 +95,7 @@ btree_lock_want(struct btree_iter *iter, int level)
return BTREE_NODE_UNLOCKED;
}
-static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
@@ -106,13 +106,6 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
mark_btree_node_unlocked(iter, level);
}
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
-{
- EBUG_ON(!level && iter->trans->nounlock);
-
- __btree_node_unlock(iter, level);
-}
-
static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
{
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
@@ -187,27 +180,14 @@ static inline bool btree_node_lock(struct btree *b,
unsigned long ip)
{
struct btree_trans *trans = iter->trans;
- bool ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->locking = b;
- trans->locking_iter_idx = iter->idx;
- trans->locking_pos = pos;
- trans->locking_btree_id = iter->btree_id;
- trans->locking_level = level;
-#endif
- ret = likely(six_trylock_type(&b->c.lock, type)) ||
+ return likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
__bch2_btree_node_lock(b, pos, level, iter, type,
should_sleep_fn, p, ip);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->locking = NULL;
-#endif
- return ret;
}
bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 1941616f..39e93da1 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -47,8 +47,6 @@ struct bset_tree {
u16 data_offset;
u16 aux_data_offset;
u16 end_offset;
-
- struct bpos max_key;
};
struct btree_write {
@@ -98,6 +96,11 @@ struct btree {
u8 byte_order;
u8 unpack_fn_len;
+ struct btree_write writes[2];
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
/*
* XXX: add a delete sequence number, so when bch2_btree_node_relock()
* fails because the lock sequence number has changed - i.e. the
@@ -128,11 +131,6 @@ struct btree {
/* lru list */
struct list_head list;
-
- struct btree_write writes[2];
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_cache {
@@ -372,7 +370,6 @@ struct btree_trans {
u8 nr_updates2;
unsigned used_mempool:1;
unsigned error:1;
- unsigned nounlock:1;
unsigned in_traverse_all:1;
u64 iters_linked;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 19dfc32e..00144707 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -437,10 +437,6 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
goto err_free;
}
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
- if (ret)
- goto err_free;
-
as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
}
@@ -458,6 +454,10 @@ static void bch2_btree_update_free(struct btree_update *as)
{
struct bch_fs *c = as->c;
+ if (as->took_gc_lock)
+ up_read(&c->gc_lock);
+ as->took_gc_lock = false;
+
bch2_journal_preres_put(&c->journal, &as->journal_preres);
bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -893,24 +893,33 @@ void bch2_btree_update_done(struct btree_update *as)
{
BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ if (as->took_gc_lock)
+ up_read(&as->c->gc_lock);
+ as->took_gc_lock = false;
+
bch2_btree_reserve_put(as);
continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
}
struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
- unsigned nr_nodes, unsigned flags,
- struct closure *cl)
+bch2_btree_update_start(struct btree_iter *iter, unsigned level,
+ unsigned nr_nodes, unsigned flags)
{
+ struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct btree_update *as;
+ struct closure cl;
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
- int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
- ? JOURNAL_RES_GET_RECLAIM : 0;
+ int journal_flags = 0;
int ret = 0;
+ if (flags & BTREE_INSERT_JOURNAL_RESERVED)
+ journal_flags |= JOURNAL_RES_GET_RESERVED;
+
+ closure_init_stack(&cl);
+retry:
/*
* This check isn't necessary for correctness - it's just to potentially
* prevent us from doing a lot of work that'll end up being wasted:
@@ -919,12 +928,36 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
if (ret)
return ERR_PTR(ret);
+ /*
+ * XXX: figure out how far we might need to split,
+ * instead of locking/reserving all the way to the root:
+ */
+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
+ trace_trans_restart_iter_upgrade(trans->ip);
+ return ERR_PTR(-EINTR);
+ }
+
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+ else if (!down_read_trylock(&c->gc_lock)) {
+ if (flags & BTREE_INSERT_NOUNLOCK)
+ return ERR_PTR(-EINTR);
+
+ bch2_trans_unlock(trans);
+ down_read(&c->gc_lock);
+ if (!bch2_trans_relock(trans)) {
+ up_read(&c->gc_lock);
+ return ERR_PTR(-EINTR);
+ }
+ }
+
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
as->c = c;
as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->btree_id = id;
+ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->btree_id = iter->btree_id;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -936,16 +969,25 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
BTREE_UPDATE_JOURNAL_RES,
journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret == -EAGAIN) {
- if (flags & BTREE_INSERT_NOUNLOCK)
- return ERR_PTR(-EINTR);
+ /*
+ * this would be cleaner if bch2_journal_preres_get() took a
+ * closure argument
+ */
+ if (flags & BTREE_INSERT_NOUNLOCK) {
+ ret = -EINTR;
+ goto err;
+ }
bch2_trans_unlock(trans);
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+ goto err;
+
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags);
if (ret)
- return ERR_PTR(ret);
+ goto err;
if (!bch2_trans_relock(trans)) {
ret = -EINTR;
@@ -960,7 +1002,8 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl);
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags,
+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (ret)
goto err;
@@ -975,6 +1018,18 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
return as;
err:
bch2_btree_update_free(as);
+
+ if (ret == -EAGAIN) {
+ BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
+
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ ret = -EINTR;
+ }
+
+ if (ret == -EINTR && bch2_trans_relock(trans))
+ goto retry;
+
return ERR_PTR(ret);
}
@@ -1419,6 +1474,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ lockdep_assert_held(&c->gc_lock);
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
@@ -1450,14 +1506,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
bch2_btree_node_unlock_write(b, iter);
btree_node_interior_verify(c, b);
-
- /*
- * when called from the btree_split path the new nodes aren't added to
- * the btree iterator yet, so the merge path's unlock/wait/relock dance
- * won't work:
- */
- bch2_foreground_maybe_merge(c, iter, b->c.level,
- flags|BTREE_INSERT_NOUNLOCK);
return;
split:
btree_split(as, b, iter, keys, flags);
@@ -1466,109 +1514,73 @@ split:
int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
unsigned flags)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = iter_l(iter)->b;
struct btree_update *as;
- struct closure cl;
+ unsigned l;
int ret = 0;
- closure_init_stack(&cl);
-
- /* Hack, because gc and splitting nodes doesn't mix yet: */
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
- !down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK) {
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- return -EINTR;
- }
-
- bch2_trans_unlock(trans);
- down_read(&c->gc_lock);
-
- if (!bch2_trans_relock(trans))
- ret = -EINTR;
- }
-
- /*
- * XXX: figure out how far we might need to split,
- * instead of locking/reserving all the way to the root:
- */
- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
- trace_trans_restart_iter_upgrade(trans->ip);
- ret = -EINTR;
- goto out;
- }
-
- as = bch2_btree_update_start(trans, iter->btree_id,
- btree_update_reserve_required(c, b), flags,
- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN) {
- BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
- bch2_trans_unlock(trans);
- ret = -EINTR;
-
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- }
- goto out;
- }
+ as = bch2_btree_update_start(iter, iter->level,
+ btree_update_reserve_required(c, b), flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
btree_split(as, b, iter, NULL, flags);
bch2_btree_update_done(as);
- /*
- * We haven't successfully inserted yet, so don't downgrade all the way
- * back to read locks;
- */
- __bch2_btree_iter_downgrade(iter, 1);
-out:
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
- closure_sync(&cl);
+ for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
+ ret = bch2_foreground_maybe_merge(c, iter, l, flags);
+
return ret;
}
-void __bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- unsigned level,
- unsigned flags,
- enum btree_node_sibling sib)
+int __bch2_foreground_maybe_merge(struct bch_fs *c,
+ struct btree_iter *iter,
+ unsigned level,
+ unsigned flags,
+ enum btree_node_sibling sib)
{
struct btree_trans *trans = iter->trans;
+ struct btree_iter *sib_iter = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
struct bkey_i delete;
struct btree *b, *m, *n, *prev, *next, *parent;
- struct closure cl;
+ struct bpos sib_pos;
size_t sib_u64s;
- int ret = 0;
+ int ret = 0, ret2 = 0;
BUG_ON(!btree_node_locked(iter, level));
-
- closure_init_stack(&cl);
retry:
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
BUG_ON(!btree_node_locked(iter, level));
b = iter->l[level].b;
- parent = btree_node_parent(iter, b);
- if (!parent)
+ if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
+ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
+ b->sib_u64s[sib] = U16_MAX;
goto out;
+ }
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
- goto out;
+ sib_pos = sib == btree_prev_sib
+ ? bpos_predecessor(b->data->min_key)
+ : bpos_successor(b->data->max_key);
- /* XXX: can't be holding read locks */
- m = bch2_btree_node_get_sibling(c, iter, b, sib);
- if (IS_ERR(m)) {
- ret = PTR_ERR(m);
+ sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
+ sib_pos, U8_MAX, level,
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(sib_iter);
+ if (ret)
goto err;
- }
- /* NULL means no sibling: */
- if (!m) {
+ m = sib_iter->l[level].b;
+
+ if (btree_node_parent(iter, b) !=
+ btree_node_parent(sib_iter, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
@@ -1581,6 +1593,8 @@ retry:
next = m;
}
+ BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
+
bch2_bkey_format_init(&new_s);
bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
__bch2_btree_calc_format(&new_s, prev);
@@ -1598,33 +1612,21 @@ retry:
}
sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
b->sib_u64s[sib] = sib_u64s;
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
- six_unlock_intent(&m->c.lock);
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- }
-
- /* We're changing btree topology, doesn't mix with gc: */
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
- !down_read_trylock(&c->gc_lock))
- goto err_cycle_gc_lock;
-
- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
- ret = -EINTR;
- goto err_unlock;
- }
- as = bch2_btree_update_start(trans, iter->btree_id,
+ parent = btree_node_parent(iter, b);
+ as = bch2_btree_update_start(iter, level,
btree_update_reserve_required(c, parent) + 1,
flags|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- goto err_unlock;
- }
+ BTREE_INSERT_USE_RESERVE);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto err;
trace_btree_merge(c, b);
@@ -1658,6 +1660,7 @@ retry:
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
+ six_lock_increment(&m->c.lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_drop(iter, m);
@@ -1671,11 +1674,9 @@ retry:
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
-
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
out:
bch2_btree_trans_verify_locks(trans);
+ bch2_trans_iter_free(trans, sib_iter);
/*
* Don't downgrade locks here: we're called after successful insert,
@@ -1686,58 +1687,56 @@ out:
* split path, and downgrading to read locks in there is potentially
* confusing:
*/
- closure_sync(&cl);
- return;
-
-err_cycle_gc_lock:
- six_unlock_intent(&m->c.lock);
-
- if (flags & BTREE_INSERT_NOUNLOCK)
- goto out;
-
- bch2_trans_unlock(trans);
-
- down_read(&c->gc_lock);
- up_read(&c->gc_lock);
- ret = -EINTR;
- goto err;
-
-err_unlock:
- six_unlock_intent(&m->c.lock);
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
+ return ret ?: ret2;
err:
- BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
-
- if ((ret == -EAGAIN || ret == -EINTR) &&
- !(flags & BTREE_INSERT_NOUNLOCK)) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto out;
+ bch2_trans_iter_put(trans, sib_iter);
+ sib_iter = NULL;
+ if (ret == -EINTR && bch2_trans_relock(trans))
goto retry;
+
+ if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
+ ret2 = ret;
+ ret = bch2_btree_iter_traverse_all(trans);
+ if (!ret)
+ goto retry;
}
goto out;
}
-static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
- struct btree *b, unsigned flags,
- struct closure *cl)
+/**
+ * bch_btree_node_rewrite - Rewrite/move a btree node
+ */
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+ __le64 seq, unsigned flags)
{
- struct btree *n, *parent = btree_node_parent(iter, b);
+ struct btree *b, *n, *parent;
struct btree_update *as;
+ int ret;
+
+ flags |= BTREE_INSERT_NOFAIL;
+retry:
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto out;
+
+ b = bch2_btree_iter_peek_node(iter);
+ if (!b || b->data->keys.seq != seq)
+ goto out;
- as = bch2_btree_update_start(iter->trans, iter->btree_id,
+ parent = btree_node_parent(iter, b);
+ as = bch2_btree_update_start(iter, b->c.level,
(parent
? btree_update_reserve_required(c, parent)
: 0) + 1,
- flags, cl);
- if (IS_ERR(as)) {
+ flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret == -EINTR)
+ goto retry;
+ if (ret) {
trace_btree_gc_rewrite_node_fail(c, b);
- return PTR_ERR(as);
+ goto out;
}
bch2_btree_interior_update_will_free_node(as, b);
@@ -1768,60 +1767,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
- return 0;
-}
-
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- *
- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
- * btree_check_reserve() has to wait)
- */
-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
- __le64 seq, unsigned flags)
-{
- struct btree_trans *trans = iter->trans;
- struct closure cl;
- struct btree *b;
- int ret;
-
- flags |= BTREE_INSERT_NOFAIL;
-
- closure_init_stack(&cl);
-
- bch2_btree_iter_upgrade(iter, U8_MAX);
-
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
- if (!down_read_trylock(&c->gc_lock)) {
- bch2_trans_unlock(trans);
- down_read(&c->gc_lock);
- }
- }
-
- while (1) {
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- break;
-
- b = bch2_btree_iter_peek_node(iter);
- if (!b || b->data->keys.seq != seq)
- break;
-
- ret = __btree_node_rewrite(c, iter, b, flags, &cl);
- if (ret != -EAGAIN &&
- ret != -EINTR)
- break;
-
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- }
-
+out:
bch2_btree_iter_downgrade(iter);
-
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
-
- closure_sync(&cl);
return ret;
}
@@ -1892,71 +1839,34 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
struct btree_update *as = NULL;
struct btree *new_hash = NULL;
struct closure cl;
- int ret;
+ int ret = 0;
closure_init_stack(&cl);
- if (!bch2_btree_iter_upgrade(iter, U8_MAX))
- return -EINTR;
-
- if (!down_read_trylock(&c->gc_lock)) {
- bch2_trans_unlock(iter->trans);
- down_read(&c->gc_lock);
-
- if (!bch2_trans_relock(iter->trans)) {
- ret = -EINTR;
- goto err;
- }
- }
-
/*
* check btree_ptr_hash_val() after @b is locked by
* btree_iter_traverse():
*/
if (btree_ptr_hash_val(new_key) != b->hash_val) {
- /* bch2_btree_reserve_get will unlock */
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
bch2_trans_unlock(iter->trans);
- up_read(&c->gc_lock);
closure_sync(&cl);
- down_read(&c->gc_lock);
-
- if (!bch2_trans_relock(iter->trans)) {
- ret = -EINTR;
- goto err;
- }
+ if (!bch2_trans_relock(iter->trans))
+ return -EINTR;
}
new_hash = bch2_btree_node_mem_alloc(c);
}
-retry:
- as = bch2_btree_update_start(iter->trans, iter->btree_id,
- parent ? btree_update_reserve_required(c, parent) : 0,
- BTREE_INSERT_NOFAIL, &cl);
+ as = bch2_btree_update_start(iter, b->c.level,
+ parent ? btree_update_reserve_required(c, parent) : 0,
+ BTREE_INSERT_NOFAIL);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
- if (ret == -EAGAIN)
- ret = -EINTR;
-
- if (ret == -EINTR) {
- bch2_trans_unlock(iter->trans);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- down_read(&c->gc_lock);
-
- if (bch2_trans_relock(iter->trans))
- goto retry;
- }
-
goto err;
}
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
- if (ret)
- goto err_free_update;
-
__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
bch2_btree_iter_downgrade(iter);
@@ -1969,12 +1879,9 @@ err:
six_unlock_write(&new_hash->c.lock);
six_unlock_intent(&new_hash->c.lock);
}
- up_read(&c->gc_lock);
closure_sync(&cl);
+ bch2_btree_cache_cannibalize_unlock(c);
return ret;
-err_free_update:
- bch2_btree_update_free(as);
- goto err;
}
/* Init code: */
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 45d21273..f2925b0d 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -48,6 +48,7 @@ struct btree_update {
} mode;
unsigned nodes_written:1;
+ unsigned took_gc_lock:1;
enum btree_id btree_id;
@@ -120,8 +121,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
-bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
- unsigned, struct closure *);
+bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
@@ -132,10 +132,10 @@ void bch2_btree_insert_node(struct btree_update *, struct btree *,
unsigned);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
-void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
- unsigned, unsigned, enum btree_node_sibling);
+int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+ unsigned, unsigned, enum btree_node_sibling);
-static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
struct btree_iter *iter,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
@@ -143,27 +143,27 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
struct btree *b;
if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
- return;
+ return 0;
if (!bch2_btree_node_relock(iter, level))
- return;
+ return 0;
b = iter->l[level].b;
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
- return;
+ return 0;
- __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
+ return __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
}
-static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
+static inline int bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
unsigned level,
unsigned flags)
{
- bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
- btree_prev_sib);
- bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
- btree_next_sib);
+ return bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ btree_prev_sib) ?:
+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ btree_next_sib);
}
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 221a6004..e258cf89 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -134,7 +134,7 @@ fix_iter:
return true;
}
-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
unsigned i, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -145,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w && w->journal.seq == seq));
six_unlock_read(&b->c.lock);
+ return 0;
}
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 0, seq);
}
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 1, seq);
}
@@ -375,7 +376,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
struct btree_insert_entry **stopped_at)
{
struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
@@ -423,7 +423,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (marking) {
percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
+ }
+
+ /* Must be called under mark_lock: */
+ if (marking && trans->fs_usage_deltas &&
+ !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) {
+ ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+ goto err;
}
/*
@@ -462,21 +468,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
i->k->k.version = MAX_VERSION;
}
- /* Must be called under mark_lock: */
- if (marking && trans->fs_usage_deltas &&
- bch2_replicas_delta_list_apply(c, fs_usage,
- trans->fs_usage_deltas)) {
- ret = BTREE_INSERT_NEED_MARK_REPLICAS;
- goto err;
- }
-
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
bch2_mark_update(trans, i->iter, i->k,
- fs_usage, i->trigger_flags);
+ NULL, i->trigger_flags);
- if (marking)
- bch2_trans_fs_usage_apply(trans, fs_usage);
+ if (marking && trans->fs_usage_deltas)
+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
@@ -485,31 +483,85 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
- bch2_fs_usage_scratch_put(c, fs_usage);
percpu_up_read(&c->mark_lock);
}
return ret;
}
+static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct btree_insert_entry *i;
+ struct btree *b = iter_l(iter)->b;
+ struct bkey_s_c old;
+ int u64s_delta = 0;
+ int ret;
+
+ /*
+ * Inserting directly into interior nodes is an uncommon operation with
+ * various weird edge cases: also, a lot of things about
+ * BTREE_ITER_NODES iters need to be audited
+ */
+ if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
+ return 0;
+
+ BUG_ON(iter->level);
+
+ trans_for_each_update2(trans, i) {
+ if (iter_l(i->iter)->b != b)
+ continue;
+
+ old = bch2_btree_iter_peek_slot(i->iter);
+ ret = bkey_err(old);
+ if (ret)
+ return ret;
+
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+ }
+
+ return u64s_delta <= 0
+ ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level,
+ trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR)
+ : 0;
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
static inline int do_bch2_trans_commit(struct btree_trans *trans,
struct btree_insert_entry **stopped_at)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *iter;
int ret;
+ trans_for_each_update2(trans, i) {
+ struct btree *b;
+
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+
+ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
+ continue;
+
+ b = iter_l(i->iter)->b;
+ if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
+ b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
+ ret = maybe_do_btree_merge(trans, i->iter);
+ if (unlikely(ret))
+ return ret;
+ }
+ }
+
trans_for_each_update2(trans, i)
- BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
- ret = bch2_journal_preres_get(&trans->c->journal,
+ ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK|
- ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)
- ? JOURNAL_RES_GET_RECLAIM : 0));
+ ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
+ ? JOURNAL_RES_GET_RESERVED : 0));
if (unlikely(ret == -EAGAIN))
ret = bch2_trans_journal_preres_get_cold(trans,
trans->journal_preres_u64s);
@@ -547,7 +599,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_lock_for_insert(trans->c,
+ bch2_btree_node_lock_for_insert(c,
iter_l(i->iter)->b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
@@ -558,33 +610,43 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
i->iter);
if (!ret && trans->journal_pin)
- bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq,
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
trans->journal_pin, NULL);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
*/
- bch2_journal_res_put(&trans->c->journal, &trans->journal_res);
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
if (unlikely(ret))
return ret;
- if (trans->flags & BTREE_INSERT_NOUNLOCK)
- trans->nounlock = true;
+ bch2_trans_downgrade(trans);
- if (!(trans->flags & BTREE_INSERT_NOUNLOCK))
- trans_for_each_update2(trans, i)
- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
- !same_leaf_as_prev(trans, i))
- bch2_foreground_maybe_merge(trans->c, i->iter,
- 0, trans->flags);
+ return 0;
+}
- trans->nounlock = false;
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+ int ret;
- bch2_trans_downgrade(trans);
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ return ret;
- return 0;
+ ret = !bch2_btree_key_cache_must_wait(c);
+ if (ret)
+ return ret;
+
+ if (mutex_trylock(&c->journal.reclaim_lock)) {
+ ret = bch2_journal_reclaim(&c->journal);
+ mutex_unlock(&c->journal.reclaim_lock);
+ }
+
+ if (!ret)
+ ret = !bch2_btree_key_cache_must_wait(c);
+ return ret;
}
static noinline
@@ -641,11 +703,9 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
- trans_for_each_update(trans, i) {
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k));
- if (ret)
- return ret;
- }
+ ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
+ if (ret)
+ return ret;
if (bch2_trans_relock(trans))
return 0;
@@ -656,6 +716,10 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_NEED_JOURNAL_RES:
bch2_trans_unlock(trans);
+ if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED))
+ return -EAGAIN;
+
ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
@@ -669,11 +733,8 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
- do {
- mutex_lock(&c->journal.reclaim_lock);
- ret = bch2_journal_reclaim(&c->journal);
- mutex_unlock(&c->journal.reclaim_lock);
- } while (!ret && bch2_btree_key_cache_must_wait(c));
+ wait_event(c->journal.reclaim_wait,
+ (ret = journal_reclaim_wait_done(c)));
if (!ret && bch2_trans_relock(trans))
return 0;
@@ -920,17 +981,14 @@ int __bch2_trans_commit(struct btree_trans *trans)
goto out;
}
- /*
- * We're not using bch2_btree_iter_upgrade here because
- * we know trans->nounlock can't be set:
- */
- if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) &&
- !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) {
+ if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
trace_trans_restart_upgrade(trans->ip);
ret = -EINTR;
goto out;
}
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+
u64s = jset_u64s(i->k->k.u64s);
if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
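
Not part of the patch: the commit path above now gates foreground merging on two conditions, a sibling node below c->btree_foreground_merge_threshold and a transaction that does not grow the node. A minimal sketch of that decision, using the hypothetical helper name example_wants_foreground_merge() in place of the inline logic in do_bch2_trans_commit()/maybe_do_btree_merge():

	/* Illustrative sketch only (hypothetical helper name); mirrors the checks above: */
	static bool example_wants_foreground_merge(struct bch_fs *c, struct btree *b,
						   s64 u64s_delta)
	{
		/* a sibling that has shrunk below the merge threshold... */
		bool small_sibling = b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
				     b->sib_u64s[1] < c->btree_foreground_merge_threshold;

		/* ...and an update that doesn't grow the node (net u64s delta <= 0) */
		return small_sibling && u64s_delta <= 0;
	}
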
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index e6e75235..31f7617e 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -167,37 +167,6 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
-void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage)
-{
- if (fs_usage == c->usage_scratch)
- mutex_unlock(&c->usage_scratch_lock);
- else
- kfree(fs_usage);
-}
-
-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c)
-{
- struct bch_fs_usage *ret;
- unsigned bytes = fs_usage_u64s(c) * sizeof(u64);
-
- ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (ret)
- return ret;
-
- if (mutex_trylock(&c->usage_scratch_lock))
- goto out_pool;
-
- ret = kzalloc(bytes, GFP_NOFS);
- if (ret)
- return ret;
-
- mutex_lock(&c->usage_scratch_lock);
-out_pool:
- ret = c->usage_scratch;
- memset(ret, 0, bytes);
- return ret;
-}
-
static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
unsigned journal_seq,
bool gc)
@@ -252,30 +221,28 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
return ret;
}
-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
{
- struct bch_fs_usage *ret;
- unsigned seq, i, v, u64s = fs_usage_u64s(c);
-retry:
- ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
- if (unlikely(!ret))
- return NULL;
+ struct bch_fs_usage_online *ret;
+ unsigned seq, i, u64s;
percpu_down_read(&c->mark_lock);
- v = fs_usage_u64s(c);
- if (unlikely(u64s != v)) {
- u64s = v;
+ ret = kmalloc(sizeof(struct bch_fs_usage_online) +
+ sizeof(u64) * c->replicas.nr, GFP_NOFS);
+ if (unlikely(!ret)) {
percpu_up_read(&c->mark_lock);
- kfree(ret);
- goto retry;
+ return NULL;
}
+ ret->online_reserved = percpu_u64_get(c->online_reserved);
+
+ u64s = fs_usage_u64s(c);
do {
seq = read_seqcount_begin(&c->usage_lock);
- memcpy(ret, c->usage_base, u64s * sizeof(u64));
+ memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s);
+ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
@@ -311,31 +278,31 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
void bch2_fs_usage_to_text(struct printbuf *out,
struct bch_fs *c,
- struct bch_fs_usage *fs_usage)
+ struct bch_fs_usage_online *fs_usage)
{
unsigned i;
pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
pr_buf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->hidden);
+ fs_usage->u.hidden);
pr_buf(out, "data:\t\t\t\t%llu\n",
- fs_usage->data);
+ fs_usage->u.data);
pr_buf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->cached);
+ fs_usage->u.cached);
pr_buf(out, "reserved:\t\t\t%llu\n",
- fs_usage->reserved);
+ fs_usage->u.reserved);
pr_buf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->nr_inodes);
+ fs_usage->u.nr_inodes);
pr_buf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
for (i = 0;
- i < ARRAY_SIZE(fs_usage->persistent_reserved);
+ i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
i++) {
pr_buf(out, "%u replicas:\n", i + 1);
pr_buf(out, "\treserved:\t\t%llu\n",
- fs_usage->persistent_reserved[i]);
+ fs_usage->u.persistent_reserved[i]);
}
for (i = 0; i < c->replicas.nr; i++) {
@@ -344,7 +311,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
pr_buf(out, "\t");
bch2_replicas_entry_to_text(out, e);
- pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]);
+ pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
}
}
@@ -360,12 +327,12 @@ static u64 avail_factor(u64 r)
return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
- return min(fs_usage->hidden +
- fs_usage->btree +
- fs_usage->data +
- reserve_factor(fs_usage->reserved +
+ return min(fs_usage->u.hidden +
+ fs_usage->u.btree +
+ fs_usage->u.data +
+ reserve_factor(fs_usage->u.reserved +
fs_usage->online_reserved),
c->capacity);
}
@@ -382,7 +349,7 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
bch2_fs_usage_read_one(c, &c->usage_base->btree);
reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
- bch2_fs_usage_read_one(c, &c->usage_base->online_reserved);
+ percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
@@ -436,43 +403,6 @@ static bool bucket_became_unavailable(struct bucket_mark old,
!is_available_bucket(new);
}
-int bch2_fs_usage_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct disk_reservation *disk_res,
- unsigned journal_seq)
-{
- s64 added = fs_usage->data + fs_usage->reserved;
- s64 should_not_have_added;
- int ret = 0;
-
- percpu_rwsem_assert_held(&c->mark_lock);
-
- /*
- * Not allowed to reduce sectors_available except by getting a
- * reservation:
- */
- should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
- if (WARN_ONCE(should_not_have_added > 0,
- "disk usage increased by %lli more than reservation of %llu",
- added, disk_res ? disk_res->sectors : 0)) {
- atomic64_sub(should_not_have_added, &c->sectors_available);
- added -= should_not_have_added;
- ret = -1;
- }
-
- if (added > 0) {
- disk_res->sectors -= added;
- fs_usage->online_reserved -= added;
- }
-
- preempt_disable();
- acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false),
- (u64 *) fs_usage, fs_usage_u64s(c));
- preempt_enable();
-
- return ret;
-}
-
static inline void account_bucket(struct bch_fs_usage *fs_usage,
struct bch_dev_usage *dev_usage,
enum bch_data_type type,
@@ -494,6 +424,8 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
+ if (!fs_usage)
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
@@ -504,8 +436,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
account_bucket(fs_usage, u, bucket_type(new),
1, ca->mi.bucket_size);
- u->buckets_alloc +=
- (int) new.owned_by_allocator - (int) old.owned_by_allocator;
u->buckets_ec += (int) new.stripe - (int) old.stripe;
u->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
@@ -524,22 +454,17 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
-static inline int update_replicas(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
- s64 sectors)
+static inline void update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
- if (idx < 0)
- return -1;
-
- if (!fs_usage)
- return 0;
+ BUG_ON(idx < 0);
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
- return 0;
}
static inline void update_cached_sectors(struct bch_fs *c,
@@ -586,6 +511,7 @@ static inline void update_replicas_list(struct btree_trans *trans,
n = (void *) d->d + d->used;
n->delta = sectors;
memcpy(&n->r, r, replicas_entry_bytes(r));
+ bch2_replicas_entry_sort(&n->r);
d->used += b;
}
@@ -599,43 +525,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
-static inline struct replicas_delta *
-replicas_delta_next(struct replicas_delta *d)
-{
- return (void *) d + replicas_entry_bytes(&d->r) + 8;
-}
-
-int bch2_replicas_delta_list_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct replicas_delta_list *r)
-{
- struct replicas_delta *d = r->d;
- struct replicas_delta *top = (void *) r->d + r->used;
- unsigned i;
-
- for (d = r->d; d != top; d = replicas_delta_next(d))
- if (update_replicas(c, fs_usage, &d->r, d->delta)) {
- top = d;
- goto unwind;
- }
-
- if (!fs_usage)
- return 0;
-
- fs_usage->nr_inodes += r->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- fs_usage->reserved += r->persistent_reserved[i];
- fs_usage->persistent_reserved[i] += r->persistent_reserved[i];
- }
-
- return 0;
-unwind:
- for (d = r->d; d != top; d = replicas_delta_next(d))
- update_replicas(c, fs_usage, &d->r, -d->delta);
- return -1;
-}
-
#define do_mark_fn(fn, c, pos, flags, ...) \
({ \
int gc, ret = 0; \
@@ -653,7 +542,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
{
- struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
@@ -661,13 +549,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
new.owned_by_allocator = owned_by_allocator;
}));
- /*
- * XXX: this is wrong, this means we'll be doing updates to the percpu
- * buckets_alloc counter that don't have an open journal buffer and
- * we'll race with the machinery that accumulates that to ca->usage_base
- */
- bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
-
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@@ -1416,22 +1297,15 @@ int bch2_mark_update(struct btree_trans *trans,
return ret;
}
-void bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct bch_fs_usage *fs_usage)
+static noinline __cold
+void fs_usage_apply_warn(struct btree_trans *trans,
+ unsigned disk_res_sectors)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- static int warned_disk_usage = 0;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
char buf[200];
- if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res,
- trans->journal_res.seq) ||
- warned_disk_usage ||
- xchg(&warned_disk_usage, 1))
- return;
-
- bch_err(c, "disk usage increased more than %llu sectors reserved",
+ bch_err(c, "disk usage increased more than %u sectors reserved",
disk_res_sectors);
trans_for_each_update(trans, i) {
@@ -1466,6 +1340,65 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
}
}
+void bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ static int warned_disk_usage = 0;
+ bool warn = false;
+ unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ struct replicas_delta *d = deltas->d;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ s64 added = 0, should_not_have_added;
+ unsigned i;
+
+ percpu_rwsem_assert_held(&c->mark_lock);
+
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+
+ update_replicas(c, dst, &d->r, d->delta);
+ }
+
+ dst->nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added += deltas->persistent_reserved[i];
+ dst->reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ /*
+ * Not allowed to reduce sectors_available except by getting a
+ * reservation:
+ */
+ should_not_have_added = added - (s64) disk_res_sectors;
+ if (unlikely(should_not_have_added > 0)) {
+ atomic64_sub(should_not_have_added, &c->sectors_available);
+ added -= should_not_have_added;
+ warn = true;
+ }
+
+ if (added > 0) {
+ trans->disk_res->sectors -= added;
+ this_cpu_sub(*c->online_reserved, added);
+ }
+
+ preempt_enable();
+
+ if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+ fs_usage_apply_warn(trans, disk_res_sectors);
+}
+
/* trans_mark: */
static struct btree_iter *trans_get_update(struct btree_trans *trans,
@@ -2197,16 +2130,6 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c,
/* Disk reservations: */
-void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
-{
- percpu_down_read(&c->mark_lock);
- this_cpu_sub(c->usage[0]->online_reserved,
- res->sectors);
- percpu_up_read(&c->mark_lock);
-
- res->sectors = 0;
-}
-
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
@@ -2240,7 +2163,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
out:
pcpu->sectors_available -= sectors;
- this_cpu_add(c->usage[0]->online_reserved, sectors);
+ this_cpu_add(*c->online_reserved, sectors);
res->sectors += sectors;
preempt_enable();
@@ -2257,7 +2180,7 @@ recalculate:
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
max_t(s64, 0, sectors_available - sectors));
- this_cpu_add(c->usage[0]->online_reserved, sectors);
+ this_cpu_add(*c->online_reserved, sectors);
res->sectors += sectors;
ret = 0;
} else {
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 6d15c455..54dcc827 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -210,19 +210,16 @@ static inline unsigned dev_usage_u64s(void)
return sizeof(struct bch_dev_usage) / sizeof(u64);
}
-void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
-
u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
void bch2_fs_usage_to_text(struct printbuf *,
- struct bch_fs *, struct bch_fs_usage *);
+ struct bch_fs *, struct bch_fs_usage_online *);
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
@@ -240,20 +237,15 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned,
s64, struct bch_fs_usage *, u64, unsigned);
-int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, unsigned);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);
-int bch2_replicas_delta_list_apply(struct bch_fs *,
- struct bch_fs_usage *,
- struct replicas_delta_list *);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c,
unsigned, s64, unsigned);
int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned);
-void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
+void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *,
struct disk_reservation *, struct bch_dev *,
@@ -263,13 +255,11 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
/* disk reservations: */
-void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
-
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
- if (res->sectors)
- __bch2_disk_reservation_put(c, res);
+ this_cpu_sub(*c->online_reserved, res->sectors);
+ res->sectors = 0;
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 404c89a7..588b1a72 100644
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -53,7 +53,6 @@ struct bucket_array {
};
struct bch_dev_usage {
- u64 buckets_alloc;
u64 buckets_ec;
u64 buckets_unavailable;
@@ -66,12 +65,6 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
-
- u64 online_reserved;
-
- /* fields after online_reserved are cleared/recalculated by gc: */
- u64 gc_start[0];
-
u64 hidden;
u64 btree;
u64 data;
@@ -91,6 +84,11 @@ struct bch_fs_usage {
u64 replicas[];
};
+struct bch_fs_usage_online {
+ u64 online_reserved;
+ struct bch_fs_usage u;
+};
+
struct bch_fs_usage_short {
u64 capacity;
u64 used;
@@ -98,22 +96,6 @@ struct bch_fs_usage_short {
u64 nr_inodes;
};
-struct replicas_delta {
- s64 delta;
- struct bch_replicas_entry r;
-} __packed;
-
-struct replicas_delta_list {
- unsigned size;
- unsigned used;
-
- struct {} memset_start;
- u64 nr_inodes;
- u64 persistent_reserved[BCH_REPLICAS_MAX];
- struct {} memset_end;
- struct replicas_delta d[0];
-};
-
/*
* A reservation for space on disk:
*/
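
Not part of the patch: a minimal sketch of how a caller consumes the new split, where struct bch_fs_usage keeps the counters that gc recalculates and struct bch_fs_usage_online wraps it together with online_reserved. The function name example_usage_to_text() is hypothetical; the calls mirror fs_alloc_debug_to_text() in the sysfs.c hunk later in this patch.

	/* Illustrative sketch only (hypothetical function name): */
	static int example_usage_to_text(struct printbuf *out, struct bch_fs *c)
	{
		struct bch_fs_usage_online *u = bch2_fs_usage_read(c);

		if (!u)
			return -ENOMEM;

		/* persistent counters now live under ->u; online_reserved sits beside them */
		pr_buf(out, "data:\t\t\t%llu\n",	u->u.data);
		pr_buf(out, "online reserved:\t%llu\n",	u->online_reserved);
		pr_buf(out, "sectors used:\t\t%llu\n",	bch2_fs_sectors_used(c, u));

		kfree(u);	/* bch2_fs_usage_read() kmalloc()s the result */
		return 0;
	}
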
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 49842ec8..c6160147 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -379,7 +379,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
{
struct bch_ioctl_fs_usage *arg = NULL;
struct bch_replicas_usage *dst_e, *dst_end;
- struct bch_fs_usage *src;
+ struct bch_fs_usage_online *src;
u32 replica_entries_bytes;
unsigned i;
int ret = 0;
@@ -405,7 +405,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
arg->online_reserved = src->online_reserved;
for (i = 0; i < BCH_REPLICAS_MAX; i++)
- arg->persistent_reserved[i] = src->persistent_reserved[i];
+ arg->persistent_reserved[i] = src->u.persistent_reserved[i];
dst_e = arg->replicas;
dst_end = (void *) arg->replicas + replica_entries_bytes;
@@ -419,7 +419,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
break;
}
- dst_e->sectors = src->replicas[i];
+ dst_e->sectors = src->u.replicas[i];
dst_e->r = *src_e;
/* recheck after setting nr_devs: */
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 69c553a6..b901be5b 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -11,6 +11,7 @@
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
@@ -59,21 +60,23 @@ journal_seq_to_buf(struct journal *j, u64 seq)
return buf;
}
-static void journal_pin_new_entry(struct journal *j, int count)
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
- struct journal_entry_pin_list *p;
+ INIT_LIST_HEAD(&p->list);
+ INIT_LIST_HEAD(&p->key_cache_list);
+ INIT_LIST_HEAD(&p->flushed);
+ atomic_set(&p->count, count);
+ p->devs.nr = 0;
+}
+static void journal_pin_new_entry(struct journal *j)
+{
/*
* The fifo_push() needs to happen at the same time as j->seq is
* incremented for journal_last_seq() to be calculated correctly
*/
atomic64_inc(&j->seq);
- p = fifo_push_ref(&j->pin);
-
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, count);
- p->devs.nr = 0;
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
}
static void bch2_journal_buf_init(struct journal *j)
@@ -192,7 +195,7 @@ static bool __journal_entry_close(struct journal *j)
__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
/* Initialize new buffer: */
- journal_pin_new_entry(j, 1);
+ journal_pin_new_entry(j);
bch2_journal_buf_init(j);
@@ -450,6 +453,27 @@ unlock:
if (!ret)
goto retry;
+ if ((ret == cur_entry_journal_full ||
+ ret == cur_entry_journal_pin_full) &&
+ !can_discard &&
+ j->reservations.idx == j->reservations.unwritten_idx &&
+ (flags & JOURNAL_RES_GET_RESERVED)) {
+ char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+
+ bch_err(c, "Journal stuck!");
+ if (journal_debug_buf) {
+ bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+ bch_err(c, "%s", journal_debug_buf);
+
+ bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
+ bch_err(c, "Journal pins:\n%s", journal_debug_buf);
+ kfree(journal_debug_buf);
+ }
+
+ bch2_fatal_error(c);
+ dump_stack();
+ }
+
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
@@ -499,7 +523,7 @@ static bool journal_preres_available(struct journal *j,
unsigned new_u64s,
unsigned flags)
{
- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags);
+ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
if (!ret && mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
@@ -1009,12 +1033,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
- fifo_for_each_entry_ptr(p, &j->pin, seq) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 1);
- p->devs.nr = 0;
- }
+ fifo_for_each_entry_ptr(p, &j->pin, seq)
+ journal_pin_list_init(p, 1);
list_for_each_entry(i, journal_entries, list) {
unsigned ptr;
@@ -1037,7 +1057,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
set_bit(JOURNAL_STARTED, &j->flags);
j->last_flush_write = jiffies;
- journal_pin_new_entry(j, 1);
+ journal_pin_new_entry(j);
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
@@ -1114,6 +1134,7 @@ int bch2_fs_journal_init(struct journal *j)
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ init_waitqueue_head(&j->reclaim_wait);
init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);
@@ -1166,6 +1187,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"last_seq_ondisk:\t%llu\n"
"flushed_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
+ "each entry reserved:\t%u\n"
"nr flush writes:\t%llu\n"
"nr noflush writes:\t%llu\n"
"nr direct reclaim:\t%llu\n"
@@ -1180,6 +1202,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->flushed_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
+ j->entry_u64s_reserved,
j->nr_flush_writes,
j->nr_noflush_writes,
j->nr_direct_reclaim,
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index bda8cb97..cc497125 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -213,11 +213,13 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type
enum btree_id id, unsigned level,
const void *data, unsigned u64s)
{
- memset(entry, 0, sizeof(*entry));
entry->u64s = cpu_to_le16(u64s);
- entry->type = type;
entry->btree_id = id;
entry->level = level;
+ entry->type = type;
+ entry->pad[0] = 0;
+ entry->pad[1] = 0;
+ entry->pad[2] = 0;
memcpy_u64s_small(entry->_data, data, u64s);
return jset_u64s(u64s);
@@ -306,7 +308,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1)
#define JOURNAL_RES_GET_RESERVED (1 << 2)
-#define JOURNAL_RES_GET_RECLAIM (1 << 3)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
@@ -410,7 +411,12 @@ static inline void bch2_journal_preres_put(struct journal *j,
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
- closure_wake_up(&j->preres_wait);
+
+ if (unlikely(s.waiting)) {
+ clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
+ (unsigned long *) &j->prereserved.v);
+ closure_wake_up(&j->preres_wait);
+ }
if (s.reserved <= s.remaining &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
@@ -426,32 +432,32 @@ int __bch2_journal_preres_get(struct journal *,
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
- unsigned flags)
+ unsigned flags,
+ bool set_waiting)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
+ int ret;
do {
old.v = new.v = v;
-
- new.reserved += d;
-
- /*
- * If we're being called from the journal reclaim path, we have
- * to unconditionally give out the pre-reservation, there's
- * nothing else sensible we can do - otherwise we'd recurse back
- * into the reclaim path and deadlock:
- */
-
- if (!(flags & JOURNAL_RES_GET_RECLAIM) &&
- new.reserved > new.remaining)
+ ret = 0;
+
+ if ((flags & JOURNAL_RES_GET_RESERVED) ||
+ new.reserved + d < new.remaining) {
+ new.reserved += d;
+ ret = 1;
+ } else if (set_waiting && !new.waiting)
+ new.waiting = true;
+ else
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
- res->u64s += d;
- return 1;
+ if (ret)
+ res->u64s += d;
+ return ret;
}
static inline int bch2_journal_preres_get(struct journal *j,
@@ -462,7 +468,7 @@ static inline int bch2_journal_preres_get(struct journal *j,
if (new_u64s <= res->u64s)
return 0;
- if (bch2_journal_preres_get_fast(j, res, new_u64s, flags))
+ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 93b5e07e..7be6c65c 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -239,7 +239,7 @@ void bch2_journal_space_available(struct journal *j)
u64s_remaining = (u64) clean << 6;
u64s_remaining -= (u64) total << 3;
u64s_remaining = max(0LL, u64s_remaining);
- u64s_remaining /= 2;
+ u64s_remaining /= 4;
u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
@@ -353,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j,
if (!journal_pin_active(pin))
return;
+ if (j->flush_in_progress == pin)
+ j->flush_in_progress_dropped = true;
+
pin_list = journal_seq_pin(j, pin->seq);
pin->seq = 0;
list_del_init(&pin->list);
@@ -404,7 +407,12 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
pin->seq = seq;
pin->flush = flush_fn;
- list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
+ if (flush_fn == bch2_btree_key_cache_journal_flush)
+ list_add(&pin->list, &pin_list->key_cache_list);
+ else if (flush_fn)
+ list_add(&pin->list, &pin_list->list);
+ else
+ list_add(&pin->list, &pin_list->flushed);
spin_unlock(&j->lock);
/*
@@ -434,39 +442,49 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
*/
static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
+journal_get_next_pin(struct journal *j,
+ bool get_any,
+ bool get_key_cache,
+ u64 max_seq, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
- if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
- return NULL;
-
- spin_lock(&j->lock);
-
- fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
- if (*seq > max_seq ||
- (ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list)))
+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
+ if (*seq > max_seq && !get_any && !get_key_cache)
break;
- if (ret) {
- list_move(&ret->list, &pin_list->flushed);
- BUG_ON(j->flush_in_progress);
- j->flush_in_progress = ret;
- }
+ if (*seq <= max_seq || get_any) {
+ ret = list_first_entry_or_null(&pin_list->list,
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
- spin_unlock(&j->lock);
+ if (*seq <= max_seq || get_any || get_key_cache) {
+ ret = list_first_entry_or_null(&pin_list->key_cache_list,
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
+ }
- return ret;
+ return NULL;
}
/* returns true if we did work */
-static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
- unsigned min_nr)
+static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+ unsigned min_any,
+ unsigned min_key_cache)
{
struct journal_entry_pin *pin;
- u64 seq, ret = 0;
+ size_t nr_flushed = 0;
+ journal_pin_flush_fn flush_fn;
+ u64 seq;
+ int err;
+
+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
+ return 0;
lockdep_assert_held(&j->reclaim_lock);
@@ -475,23 +493,47 @@ static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
j->last_flushed = jiffies;
- pin = journal_get_next_pin(j, min_nr
- ? U64_MAX : seq_to_flush, &seq);
+ spin_lock(&j->lock);
+ pin = journal_get_next_pin(j,
+ min_any != 0,
+ min_key_cache != 0,
+ seq_to_flush, &seq);
+ if (pin) {
+ BUG_ON(j->flush_in_progress);
+ j->flush_in_progress = pin;
+ j->flush_in_progress_dropped = false;
+ flush_fn = pin->flush;
+ }
+ spin_unlock(&j->lock);
+
if (!pin)
break;
- if (min_nr)
- min_nr--;
+ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
+ min_key_cache--;
+
+ if (min_any)
+ min_any--;
- pin->flush(j, pin, seq);
+ err = flush_fn(j, pin, seq);
- BUG_ON(j->flush_in_progress != pin);
+ spin_lock(&j->lock);
+ /* Pin might have been dropped or rearmed: */
+ if (likely(!err && !j->flush_in_progress_dropped))
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
j->flush_in_progress = NULL;
+ j->flush_in_progress_dropped = false;
+ spin_unlock(&j->lock);
+
wake_up(&j->pin_flush_wait);
- ret++;
+
+ if (err)
+ break;
+
+ nr_flushed++;
}
- return ret;
+ return nr_flushed;
}
static u64 journal_seq_to_flush(struct journal *j)
@@ -556,8 +598,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0;
- u64 seq_to_flush, nr_flushed = 0;
- size_t min_nr;
+ u64 seq_to_flush;
+ size_t min_nr, nr_flushed;
unsigned flags;
int ret = 0;
@@ -595,15 +637,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
if (j->prereserved.reserved * 2 > j->prereserved.remaining)
min_nr = 1;
- if (atomic_read(&c->btree_cache.dirty) * 4 >
- c->btree_cache.used * 3)
- min_nr = 1;
-
if (fifo_free(&j->pin) <= 32)
min_nr = 1;
- min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c));
-
trace_journal_reclaim_start(c,
min_nr,
j->prereserved.reserved,
@@ -613,14 +649,19 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
- nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);
+ nr_flushed = journal_flush_pins(j, seq_to_flush,
+ min_nr,
+ min(bch2_nr_btree_keys_need_flush(c), 128UL));
if (direct)
j->nr_direct_reclaim += nr_flushed;
else
j->nr_background_reclaim += nr_flushed;
trace_journal_reclaim_finish(c, nr_flushed);
- } while (min_nr && nr_flushed);
+
+ if (nr_flushed)
+ wake_up(&j->reclaim_wait);
+ } while (min_nr && nr_flushed && !direct);
memalloc_noreclaim_restore(flags);
@@ -713,7 +754,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0;
+ *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
spin_lock(&j->lock);
/*
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index d17a1ff8..c24bc4aa 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -43,6 +43,7 @@ struct journal_buf {
struct journal_entry_pin_list {
struct list_head list;
+ struct list_head key_cache_list;
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
@@ -50,7 +51,7 @@ struct journal_entry_pin_list {
struct journal;
struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j,
+typedef int (*journal_pin_flush_fn)(struct journal *j,
struct journal_entry_pin *, u64);
struct journal_entry_pin {
@@ -105,8 +106,9 @@ union journal_preres_state {
};
struct {
- u32 reserved;
- u32 remaining;
+ u64 waiting:1,
+ reserved:31,
+ remaining:32;
};
};
@@ -243,6 +245,7 @@ struct journal {
spinlock_t err_lock;
struct mutex reclaim_lock;
+ wait_queue_head_t reclaim_wait;
struct task_struct *reclaim_thread;
bool reclaim_kicked;
u64 nr_direct_reclaim;
@@ -250,6 +253,7 @@ struct journal {
unsigned long last_flushed;
struct journal_entry_pin *flush_in_progress;
+ bool flush_in_progress_dropped;
wait_queue_head_t pin_flush_wait;
/* protects advancing ja->discard_idx: */
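
Not part of the patch: with the new key_cache_list added above, bch2_journal_pin_set() routes each pin to one of three lists depending on its flush function. A short sketch of that routing, with the hypothetical helper name example_pin_list_for() standing in for the if/else chain in journal_reclaim.c:

	/* Illustrative sketch only (hypothetical helper name): */
	static struct list_head *
	example_pin_list_for(struct journal_entry_pin_list *p, journal_pin_flush_fn flush_fn)
	{
		if (flush_fn == bch2_btree_key_cache_journal_flush)
			return &p->key_cache_list;	/* key cache pins get their own list */
		else if (flush_fn)
			return &p->list;		/* ordinary flushable pins */
		else
			return &p->flushed;		/* nothing to flush */
	}
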
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 1403616b..ef69a19f 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -88,6 +88,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
if (ret)
break;
}
+ bch2_trans_iter_put(&trans, iter);
ret = bch2_trans_exit(&trans) ?: ret;
bch2_bkey_buf_exit(&sk, c);
@@ -135,20 +136,24 @@ retry:
dev_idx, flags, true);
if (ret) {
bch_err(c, "Cannot drop device without losing data");
- goto err;
+ break;
}
ret = bch2_btree_node_update_key(c, iter, b, k.k);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(iter);
+ ret = 0;
goto retry;
}
if (ret) {
bch_err(c, "Error updating btree node key: %i", ret);
- goto err;
+ break;
}
}
bch2_trans_iter_free(&trans, iter);
+
+ if (ret)
+ goto err;
}
/* flush relevant btree updates */
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index c9e18491..5b108490 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -793,6 +793,9 @@ next:
out:
bch2_trans_exit(&trans);
+ if (ret)
+ bch_err(c, "error %i in bch2_move_btree", ret);
+
return ret;
}
@@ -916,8 +919,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
c->disk_sb.sb->version_min = c->disk_sb.sb->version;
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index f9312f00..0cfbb56a 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -21,6 +21,11 @@ const char * const bch2_sb_features[] = {
NULL
};
+const char * const bch2_sb_compat[] = {
+ BCH_SB_COMPAT()
+ NULL
+};
+
const char * const bch2_btree_ids[] = {
BCH_BTREE_IDS()
NULL
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 4ae58b68..001e865c 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -10,6 +10,7 @@
extern const char * const bch2_error_actions[];
extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
extern const char * const bch2_btree_ids[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_opts[];
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 3d1bf87e..86593e92 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -935,7 +935,7 @@ static int read_btree_roots(struct bch_fs *c)
if (i == BTREE_ID_alloc &&
c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
continue;
}
@@ -945,7 +945,7 @@ static int read_btree_roots(struct bch_fs *c)
"invalid btree root %s",
bch2_btree_ids[i]);
if (i == BTREE_ID_alloc)
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
ret = bch2_btree_root_read(c, i, &r->key, r->level);
@@ -955,7 +955,7 @@ static int read_btree_roots(struct bch_fs *c)
"error reading btree root %s",
bch2_btree_ids[i]);
if (i == BTREE_ID_alloc)
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
}
@@ -998,7 +998,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) {
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
ret = -EINVAL;
goto err;
@@ -1041,7 +1041,7 @@ int bch2_fs_recovery(struct bch_fs *c)
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
"filesystem marked clean but journal not empty")) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
@@ -1075,7 +1075,7 @@ use_clean:
}
if (c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
drop_alloc_keys(&c->journal_keys);
}
@@ -1128,8 +1128,8 @@ use_clean:
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
if (c->opts.fsck ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
@@ -1215,11 +1215,11 @@ use_clean:
bch_verbose(c, "quotas done");
}
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) {
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
struct bch_move_stats stats = { 0 };
- bch_verbose(c, "scanning for old btree nodes");
+ bch_info(c, "scanning for old btree nodes");
ret = bch2_fs_read_write(c);
if (ret)
goto err;
@@ -1227,7 +1227,7 @@ use_clean:
ret = bch2_scan_old_btree_nodes(c, &stats);
if (ret)
goto err;
- bch_verbose(c, "scanning for old btree nodes done");
+ bch_info(c, "scanning for old btree nodes done");
}
mutex_lock(&c->sb_lock);
@@ -1238,7 +1238,7 @@ use_clean:
}
if (!test_bit(BCH_FS_ERROR, &c->flags)) {
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
write_sb = true;
}
@@ -1289,8 +1289,8 @@ int bch2_fs_initialize(struct bch_fs *c)
bch_notice(c, "initializing new filesystem");
mutex_lock(&c->sb_lock);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
if (c->opts.version_upgrade) {
c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index be73b458..1e297171 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -271,11 +271,13 @@ static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
- struct bch_fs_usage *new_scratch = NULL;
+ struct bch_fs_usage_online *new_scratch = NULL;
struct bch_fs_usage __percpu *new_gc = NULL;
struct bch_fs_usage *new_base = NULL;
unsigned i, bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
+ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
+ sizeof(u64) * new_r->nr;
int ret = 0;
memset(new_usage, 0, sizeof(new_usage));
@@ -286,7 +288,7 @@ static int replicas_table_update(struct bch_fs *c,
goto err;
if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
- !(new_scratch = kmalloc(bytes, GFP_KERNEL)) ||
+ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
(c->usage_gc &&
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
goto err;
@@ -462,6 +464,36 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
+/* replicas delta list: */
+
+bool bch2_replicas_delta_list_marked(struct bch_fs *c,
+ struct replicas_delta_list *r)
+{
+ struct replicas_delta *d = r->d;
+ struct replicas_delta *top = (void *) r->d + r->used;
+
+ percpu_rwsem_assert_held(&c->mark_lock);
+
+ for (d = r->d; d != top; d = replicas_delta_next(d))
+ if (bch2_replicas_entry_idx(c, &d->r) < 0)
+ return false;
+ return true;
+}
+
+int bch2_replicas_delta_list_mark(struct bch_fs *c,
+ struct replicas_delta_list *r)
+{
+ struct replicas_delta *d = r->d;
+ struct replicas_delta *top = (void *) r->d + r->used;
+ int ret = 0;
+
+ for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
+ ret = bch2_mark_replicas(c, &d->r);
+ return ret;
+}
+
+/* bkey replicas: */
+
bool bch2_bkey_replicas_marked(struct bch_fs *c,
struct bkey_s_c k)
{
@@ -473,6 +505,11 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
return __bch2_mark_bkey_replicas(c, k, false);
}
+/*
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
+ * die at some point:
+ */
+
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
unsigned i;
@@ -566,6 +603,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
return 0;
}
+/* New much simpler mechanism for clearing out unneeded replicas entries: */
+
int bch2_replicas_gc2(struct bch_fs *c)
{
struct bch_replicas_cpu new = { 0 };
@@ -966,11 +1005,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
percpu_down_read(&c->mark_lock);
for_each_cpu_replicas_entry(&c->replicas, e) {
- unsigned i, nr_online = 0, dflags = 0;
+ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user;
- for (i = 0; i < e->nr_devs; i++)
+ for (i = 0; i < e->nr_devs; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
+
nr_online += test_bit(e->devs[i], devs.d);
+ nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+ }
+
+ if (nr_failed == e->nr_devs)
+ continue;
if (nr_online < e->nr_required)
dflags |= metadata
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
index 9c8fd3d9..c77e873e 100644
--- a/libbcachefs/replicas.h
+++ b/libbcachefs/replicas.h
@@ -26,6 +26,31 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
+struct replicas_delta {
+ s64 delta;
+ struct bch_replicas_entry r;
+} __packed;
+
+struct replicas_delta_list {
+ unsigned size;
+ unsigned used;
+
+ struct {} memset_start;
+ u64 nr_inodes;
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ struct {} memset_end;
+ struct replicas_delta d[0];
+};
+
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
+ return (void *) d + replicas_entry_bytes(&d->r) + 8;
+}
+
+bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
+int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
+
void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
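
Not part of the patch: struct replicas_delta entries are variable length (replicas_entry_bytes(&d->r) plus padding), so the new helpers always walk the list via replicas_delta_next(). A minimal sketch of that walk, using the hypothetical function name example_sum_deltas():

	/* Illustrative sketch only (hypothetical function name): */
	static s64 example_sum_deltas(struct replicas_delta_list *r)
	{
		struct replicas_delta *d   = r->d;
		struct replicas_delta *top = (void *) r->d + r->used;
		s64 sum = 0;

		for (; d != top; d = replicas_delta_next(d))
			sum += d->delta;	/* sectors added or removed for entry d->r */

		return sum;
	}
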
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 6f5d5391..17936974 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -377,7 +377,6 @@ static void bch2_sb_update(struct bch_fs *c)
ca->mi = bch2_mi_to_cpu(mi->members + i);
}
-/* doesn't copy member info */
static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
{
struct bch_sb_field *src_f, *dst_f;
@@ -996,7 +995,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct bch_dev *ca;
unsigned i, dev;
- percpu_down_write(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
if (!journal_seq) {
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
@@ -1067,7 +1066,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
}
}
- percpu_up_write(&c->mark_lock);
+ percpu_up_read(&c->mark_lock);
for (i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
@@ -1093,8 +1092,8 @@ void bch2_fs_mark_clean(struct bch_fs *c)
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata;
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index c57ebff5..2d008979 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -153,6 +153,8 @@ read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
+read_attribute(btree_avg_write_size);
+
read_attribute(bucket_quantiles_last_read);
read_attribute(bucket_quantiles_last_write);
read_attribute(bucket_quantiles_fragmentation);
@@ -230,9 +232,17 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
+static size_t bch2_btree_avg_write_size(struct bch_fs *c)
+{
+ u64 nr = atomic64_read(&c->btree_writes_nr);
+ u64 sectors = atomic64_read(&c->btree_writes_sectors);
+
+ return nr ? div64_u64(sectors, nr) : 0;
+}
+
static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
+ struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
if (!fs_usage)
return -ENOMEM;
@@ -318,6 +328,7 @@ SHOW(bch2_fs)
sysfs_print(block_size, block_bytes(c));
sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
+ sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
@@ -513,6 +524,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_block_size,
&sysfs_btree_node_size,
&sysfs_btree_cache_size,
+ &sysfs_btree_avg_write_size,
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
@@ -800,7 +812,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
pr_buf(out,
"ec\t%16llu\n"
"available%15llu\n"
- "alloc\t%16llu\n"
"\n"
"free_inc\t\t%zu/%zu\n"
"free[RESERVE_MOVINGGC]\t%zu/%zu\n"
@@ -813,7 +824,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"btree reserve cache\t%u\n",
stats.buckets_ec,
__dev_buckets_available(ca, stats),
- stats.buckets_alloc,
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,