Diffstat (limited to 'libbcachefs/btree_gc.c')
 libbcachefs/btree_gc.c | 272 ++++++++++-----------------------------
 1 file changed, 74 insertions(+), 198 deletions(-)
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index dc97991b..517c2ca2 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -20,6 +20,7 @@
#include "buckets.h"
#include "clock.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
@@ -174,10 +175,11 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
struct btree *prev, struct btree *cur,
struct bpos *pulled_from_scan)
{
+ struct bch_fs *c = trans->c;
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
@@ -215,29 +217,29 @@ static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
*pulled_from_scan = cur->data->min_key;
ret = DID_FILL_FROM_SCAN;
} else {
- if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
} else { /* overlap */
if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
- if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node%s", buf.buf))
ret = DROP_PREV_NODE;
} else {
- if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
}
} else {
if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
- if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node%s", buf.buf))
ret = DROP_THIS_NODE;
} else {
- if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
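
The dominant mechanical change in this patch: the fsck_err family of macros is now keyed by a btree_trans rather than a bch_fs, and callers that still need the filesystem pointer recover it via trans->c. A condensed sketch of the conversion pattern, taken from the hunks above:

    /* before: error reporting keyed by the filesystem */
    if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
                         "btree node with incorrect min_key%s", buf.buf))
            ret = set_node_min(c, cur, expected_start);

    /* after: keyed by the transaction; c comes from trans->c */
    struct bch_fs *c = trans->c;

    if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
                         "btree node with incorrect min_key%s", buf.buf))
            ret = set_node_min(c, cur, expected_start);

The same substitution repeats through the rest of the file (btree_repair_node_end, bch2_gc_mark_key, the reflink and stripe fixup paths, and so on).
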
@@ -249,9 +251,10 @@ fsck_err:
return ret;
}
-static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
struct btree *child, struct bpos *pulled_from_scan)
{
+ struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -265,7 +268,7 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
prt_str(&buf, "\n child: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
- if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf)) {
if (b->c.level == 1 &&
bpos_lt(*pulled_from_scan, b->key.k.p)) {
@@ -324,8 +327,8 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
- btree_node_unreadable,
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+ trans, btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
bch2_btree_id_str(b->c.btree_id),
@@ -362,7 +365,7 @@ again:
continue;
}
- ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+ ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -403,7 +406,7 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+ ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -461,8 +464,8 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- if (mustfix_fsck_err_on(!have_child, c,
- btree_node_topology_interior_node_empty,
+ if (mustfix_fsck_err_on(!have_child,
+ trans, btree_node_topology_interior_node_empty,
"empty interior btree node at btree %s level %u\n"
" %s",
bch2_btree_id_str(b->c.btree_id),
@@ -509,7 +512,7 @@ reconstruct_root:
r->error = 0;
if (!bch2_btree_has_scanned_nodes(c, i)) {
- mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+ mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
"no nodes found for btree %s, continue?", bch2_btree_id_str(i));
bch2_btree_root_alloc_fake_trans(trans, i, 0);
} else {
@@ -583,8 +586,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
BUG_ON(bch2_journal_seq_verify &&
k.k->version.lo > atomic64_read(&c->journal.seq));
- if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
- bkey_version_in_future,
+ if (fsck_err_on(btree_id != BTREE_ID_accounting &&
+ k.k->version.lo > atomic64_read(&c->key_version),
+ trans, bkey_version_in_future,
"key version number higher than recorded %llu\n %s",
atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
@@ -592,7 +596,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
- c, btree_bitmap_not_marked,
+ trans, btree_bitmap_not_marked,
"btree ptr not marked in member info btree allocated bitmap\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
@@ -622,7 +626,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
}
ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_gc|flags);
+ BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
out:
fsck_err:
printbuf_exit(&buf);
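
Two behavioural tweaks in bch2_gc_mark_key are worth flagging: keys in BTREE_ID_accounting are now exempt from the version-in-future check (presumably because accounting keys put the version field to their own use), and GC now drives keys through the trigger path with BTREE_TRIGGER_insert set in addition to BTREE_TRIGGER_gc, i.e. as if the key were being inserted. A trigger can still distinguish the GC pass by testing the flags word - an illustrative fragment only, not the actual bcachefs trigger signature:

    /* 'flags' is whatever the caller passed to bch2_key_trigger() */
    static int example_key_trigger(struct bch_fs *c, unsigned flags)
    {
            if (flags & BTREE_TRIGGER_gc) {
                    /* GC pass: account into the shadow (gc) counters */
            } else {
                    /* normal commit path: account into the live counters */
            }
            return 0;
    }
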
@@ -633,29 +637,14 @@ fsck_err:
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
- int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1;
+ unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
int ret = 0;
/* We need to make sure every leaf node is readable before going RW */
if (initial)
target_depth = 0;
- /* root */
- mutex_lock(&c->btree_root_lock);
- struct btree *b = bch2_btree_id_root(c, btree)->b;
- if (!btree_node_fake(b)) {
- gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
- ret = lockrestart_do(trans,
- bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
- NULL, NULL, bkey_i_to_s_c(&b->key), initial));
- level = b->c.level;
- }
- mutex_unlock(&c->btree_root_lock);
-
- if (ret)
- return ret;
-
- for (; level >= target_depth; --level) {
+ for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
struct btree *prev = NULL;
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
@@ -666,9 +655,21 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
}));
if (ret)
- break;
+ goto err;
}
+ /* root */
+ mutex_lock(&c->btree_root_lock);
+ struct btree *b = bch2_btree_id_root(c, btree)->b;
+ if (!btree_node_fake(b)) {
+ gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
+ ret = lockrestart_do(trans,
+ bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
+ NULL, NULL, bkey_i_to_s_c(&b->key), initial));
+ }
+ mutex_unlock(&c->btree_root_lock);
+err:
+ bch_err_fn(c, ret);
return ret;
}
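
bch2_gc_btree's walk is inverted here: the old code marked the root first (under btree_root_lock) and then descended one level at a time, while the new code ascends from target_depth and marks the root last. Errors now also funnel through a common err label with a bch_err_fn report. Condensed, with mark_root/mark_level/root_level as hypothetical stand-ins for the inlined bodies above:

    /* old shape: root first, then each level downward */
    mark_root(trans, btree);                /* under btree_root_lock */
    for (int level = root_level; level >= target_depth; --level)
            mark_level(trans, btree, level);

    /* new shape: each level upward, root last */
    for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++)
            mark_level(trans, btree, level);
    mark_root(trans, btree);                /* under btree_root_lock */
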
@@ -697,7 +698,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
ret = bch2_gc_btree(trans, btree, true);
if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- c, btree_node_read_error,
+ trans, btree_node_read_error,
"btree node read error for %s",
bch2_btree_id_str(btree)))
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
@@ -720,131 +721,25 @@ static int bch2_mark_superblocks(struct bch_fs *c)
static void bch2_gc_free(struct bch_fs *c)
{
+ bch2_accounting_free(&c->accounting[1]);
+
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
-
- free_percpu(ca->usage_gc);
- ca->usage_gc = NULL;
}
-
- free_percpu(c->usage_gc);
- c->usage_gc = NULL;
-}
-
-static int bch2_gc_done(struct bch_fs *c)
-{
- struct bch_dev *ca = NULL;
- struct printbuf buf = PRINTBUF;
- unsigned i;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
-
-#define copy_field(_err, _f, _msg, ...) \
- if (fsck_err_on(dst->_f != src->_f, c, _err, \
- _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \
- dst->_f, src->_f)) \
- dst->_f = src->_f
-#define copy_dev_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
-#define copy_fs_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
-
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
-
- __for_each_member_device(c, ca) {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
- dev_usage_u64s());
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_type_str(i));
- copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_type_str(i));
- copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
- }
- }
-
- {
- unsigned nr = fs_usage_u64s(c);
- struct bch_fs_usage *dst = c->usage_base;
- struct bch_fs_usage *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
-
- copy_fs_field(fs_usage_hidden_wrong,
- b.hidden, "hidden");
- copy_fs_field(fs_usage_btree_wrong,
- b.btree, "btree");
-
- copy_fs_field(fs_usage_data_wrong,
- b.data, "data");
- copy_fs_field(fs_usage_cached_wrong,
- b.cached, "cached");
- copy_fs_field(fs_usage_reserved_wrong,
- b.reserved, "reserved");
- copy_fs_field(fs_usage_nr_inodes_wrong,
- b.nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(fs_usage_persistent_reserved_wrong,
- persistent_reserved[i],
- "persistent_reserved[%i]", i);
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- printbuf_reset(&buf);
- bch2_replicas_entry_to_text(&buf, e);
-
- copy_fs_field(fs_usage_replicas_wrong,
- replicas[i], "%s", buf.buf);
- }
- }
-
-#undef copy_fs_field
-#undef copy_dev_field
-#undef copy_stripe_field
-#undef copy_field
-fsck_err:
- bch2_dev_put(ca);
- bch_err_fn(c, ret);
- percpu_up_write(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
}
static int bch2_gc_start(struct bch_fs *c)
{
- BUG_ON(c->usage_gc);
-
- c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!c->usage_gc) {
- bch_err(c, "error allocating c->usage_gc");
- return -BCH_ERR_ENOMEM_gc_start;
- }
-
for_each_member_device(c, ca) {
- BUG_ON(ca->usage_gc);
-
- ca->usage_gc = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage_gc) {
- bch_err(c, "error allocating ca->usage_gc");
+ int ret = bch2_dev_usage_init(ca, true);
+ if (ret) {
bch2_dev_put(ca);
- return -BCH_ERR_ENOMEM_gc_start;
+ return ret;
}
-
- this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
- ca->mi.nbuckets - ca->mi.first_bucket);
}
return 0;
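
The hunk above removes bch2_gc_done() wholesale: the hand-rolled copy_field/copy_dev_field/copy_fs_field machinery, which compared GC-recomputed usage against the live counters and repaired mismatches via fsck_err, is superseded by bch2_accounting_gc_done() (called from bch2_check_allocations further down). Correspondingly, bch2_gc_start() no longer hand-allocates the c->usage_gc and ca->usage_gc percpu counters; it shrinks to a per-device call into the new disk accounting code (note the disk_accounting.h include at the top of the patch), with the second argument presumably selecting the GC copy of the counters:

    for_each_member_device(c, ca) {
            int ret = bch2_dev_usage_init(ca, true);
            if (ret) {
                    bch2_dev_put(ca);   /* drop the iterator's device ref */
                    return ret;
            }
    }
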
@@ -858,6 +753,7 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
l.oldest_gen != r.oldest_gen ||
l.data_type != r.data_type ||
l.dirty_sectors != r.dirty_sectors ||
+ l.stripe_sectors != r.stripe_sectors ||
l.cached_sectors != r.cached_sectors ||
l.stripe_redundancy != r.stripe_redundancy ||
l.stripe != r.stripe;
@@ -888,6 +784,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
gc.data_type = old->data_type;
gc.dirty_sectors = old->dirty_sectors;
}
+ percpu_up_read(&c->mark_lock);
/*
* gc.data_type doesn't yet include need_discard & need_gc_gen states -
@@ -896,12 +793,14 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
alloc_data_type_set(&gc, gc.data_type);
if (gc.data_type != old_gc.data_type ||
- gc.dirty_sectors != old_gc.dirty_sectors)
- bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
- percpu_up_read(&c->mark_lock);
+ gc.dirty_sectors != old_gc.dirty_sectors) {
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
- if (fsck_err_on(new.data_type != gc.data_type, c,
- alloc_key_data_type_wrong,
+ if (fsck_err_on(new.data_type != gc.data_type,
+ trans, alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
@@ -911,7 +810,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
- if (fsck_err_on(new._f != gc._f, c, _errtype, \
+ if (fsck_err_on(new._f != gc._f, \
+ trans, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@@ -924,6 +824,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
gen);
copy_bucket_field(alloc_key_dirty_sectors_wrong,
dirty_sectors);
+ copy_bucket_field(alloc_key_stripe_sectors_wrong,
+ stripe_sectors);
copy_bucket_field(alloc_key_cached_sectors_wrong,
cached_sectors);
copy_bucket_field(alloc_key_stripe_wrong,
@@ -978,14 +880,16 @@ static int bch2_gc_alloc_done(struct bch_fs *c)
static int bch2_gc_alloc_start(struct bch_fs *c)
{
+ int ret = 0;
+
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
bch2_dev_put(ca);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -BCH_ERR_ENOMEM_gc_alloc_start;
+ ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ break;
}
buckets->first_bucket = ca->mi.first_bucket;
@@ -993,25 +897,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
- struct bch_dev *ca = NULL;
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
- g->gen_valid = 1;
- g->gen = a->gen;
- 0;
- })));
- bch2_dev_put(ca);
bch_err_fn(c, ret);
return ret;
}
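
bch2_gc_alloc_start() likewise drops its pre-walk of the alloc btree (the removed loop that seeded gc_bucket gens from bch2_alloc_to_v4), and the allocation-failure path changes from an early return to break-and-fall-through, so every exit funnels through the shared bch_err_fn() report. Condensed from the hunks above:

    int ret = 0;

    for_each_member_device(c, ca) {
            struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
                                                    ca->mi.nbuckets * sizeof(struct bucket),
                                                    GFP_KERNEL|__GFP_ZERO);
            if (!buckets) {
                    bch2_dev_put(ca);   /* drop the iteration ref before bailing */
                    ret = -BCH_ERR_ENOMEM_gc_alloc_start;
                    break;              /* fall through to the common report */
            }
            /* ... first_bucket/nbuckets setup, rcu_assign_pointer ... */
    }

    bch_err_fn(c, ret);     /* no-op when ret == 0 */
    return ret;
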
@@ -1041,8 +926,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
return -EINVAL;
}
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- reflink_v_refcount_wrong,
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
+ trans, reflink_v_refcount_wrong,
"reflink key has wrong refcount:\n"
" %s\n"
" should be %u",
@@ -1140,7 +1025,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
if (bad)
bch2_bkey_val_to_text(&buf, c, k);
- if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+ if (fsck_err_on(bad,
+ trans, stripe_sector_count_wrong,
"%s", buf.buf)) {
struct bkey_i_stripe *new;
@@ -1199,8 +1085,6 @@ int bch2_check_allocations(struct bch_fs *c)
lockdep_assert_held(&c->state_lock);
- down_write(&c->gc_lock);
-
bch2_btree_interior_updates_flush(c);
ret = bch2_gc_start(c) ?:
@@ -1212,7 +1096,9 @@ int bch2_check_allocations(struct bch_fs *c)
gc_pos_set(c, gc_phase(GC_PHASE_start));
ret = bch2_mark_superblocks(c);
- BUG_ON(ret);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto out;
ret = bch2_gc_btrees(c);
if (ret)
@@ -1220,15 +1106,11 @@ int bch2_check_allocations(struct bch_fs *c)
c->gc_count++;
- bch2_journal_block(&c->journal);
-out:
ret = bch2_gc_alloc_done(c) ?:
- bch2_gc_done(c) ?:
+ bch2_accounting_gc_done(c) ?:
bch2_gc_stripes_done(c) ?:
bch2_gc_reflink_done(c);
-
- bch2_journal_unblock(&c->journal);
-
+out:
percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_not_running));
@@ -1236,13 +1118,6 @@ out:
bch2_gc_free(c);
percpu_up_write(&c->mark_lock);
- up_write(&c->gc_lock);
-
- /*
- * At startup, allocations can happen directly instead of via the
- * allocator thread - issue wakeup in case they blocked on gc_lock:
- */
- closure_wake_up(&c->freelist_wait);
bch_err_fn(c, ret);
return ret;
}
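
In bch2_check_allocations(), the gc_lock write-lock and the journal block/unblock bracket are removed (the lockdep assert on state_lock stays), the superblock-marking BUG_ON becomes a reported error with a goto out, and the final reconciliation is an error-chained sequence using the GNU ?: extension:

    /* a ?: b evaluates to a when a is nonzero (b is not evaluated),
       otherwise to b - so this stops at the first failing step and
       ret holds that step's error code */
    ret =   bch2_gc_alloc_done(c) ?:
            bch2_accounting_gc_done(c) ?:
            bch2_gc_stripes_done(c) ?:
            bch2_gc_reflink_done(c);
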
@@ -1323,7 +1198,7 @@ int bch2_gc_gens(struct bch_fs *c)
int ret;
/*
- * Ideally we would be using state_lock and not gc_lock here, but that
+ * Ideally we would be using state_lock and not gc_gens_lock here, but that
* introduces a deadlock in the RO path - we currently take the state
* lock at the start of going RO, thus the gc thread may get stuck:
*/
@@ -1331,7 +1206,8 @@ int bch2_gc_gens(struct bch_fs *c)
return 0;
trace_and_count(c, gc_gens_start, c);
- down_read(&c->gc_lock);
+
+ down_read(&c->state_lock);
for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
@@ -1400,7 +1276,7 @@ err:
ca->oldest_gen = NULL;
}
- up_read(&c->gc_lock);
+ up_read(&c->state_lock);
mutex_unlock(&c->gc_gens_lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
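
Finally, bch2_gc_gens() switches its reader-side lock from the removed gc_lock to state_lock, matching the adjusted comment, while gc_gens_lock continues to serialize concurrent runs. The resulting bracket, condensed (the trylock guard is implied by the "return 0" context and the unlock above):

    if (!mutex_trylock(&c->gc_gens_lock))
            return 0;           /* another gc_gens run is already in flight */

    down_read(&c->state_lock);
    /* ... walk devices, collect oldest gens, bump bucket gens ... */
    up_read(&c->state_lock);
    mutex_unlock(&c->gc_gens_lock);
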