summary refs log tree commit diff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/bcachefs/bcachefs.h 2
-rw-r--r-- fs/bcachefs/btree_gc.c 6
-rw-r--r-- fs/bcachefs/btree_io.c 1
-rw-r--r-- fs/bcachefs/disk_accounting.c 7
-rw-r--r-- fs/bcachefs/ec.c 55
-rw-r--r-- fs/bcachefs/ec.h 9
-rw-r--r-- fs/bcachefs/inode.c 2
-rw-r--r-- fs/bcachefs/journal_reclaim.c 7
-rw-r--r-- fs/bcachefs/rebalance.c 112
-rw-r--r-- fs/bcachefs/sb-errors_format.h 11
-rw-r--r-- fs/bcachefs/super.c 13
11 files changed, 206 insertions, 19 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 3ccca855f05e..933c5a68eff9 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -686,6 +686,7 @@ struct btree_debug {
unsigned id;
};
+#define BCH_LINK_MAX U32_MAX
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
@@ -1061,6 +1062,7 @@ struct bch_fs {
GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32];
+ struct hlist_head ec_stripes_new_buckets[64];
spinlock_t ec_stripes_new_lock;
/* ERASURE CODING */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 63dc0836bf08..638c2a9268a9 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -204,7 +204,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
if (bpos_eq(expected_start, cur->data->min_key))
return 0;
- prt_printf(&buf, " at ");
+ prt_printf(&buf, " at ");
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
prt_printf(&buf, ":\nparent: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
@@ -229,8 +229,8 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
*pulled_from_scan = cur->data->min_key;
ret = bch_err_throw(c, topology_repair_did_fill_from_scan);
} else {
- if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
- "btree node with incorrect min_key%s", buf.buf))
+ if (mustfix_fsck_err(trans, btree_node_topology_gap_between_nodes,
+ "gap between btree nodes%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
} else { /* overlap */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 52d21259ed6f..3808c41dda84 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1318,6 +1318,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset_end(b, b->set);
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_error(b);
+ ret = 0;
continue;
}
if (ret)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 9da26e11446b..b20ea162bfa3 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -817,6 +817,8 @@ int bch2_accounting_read(struct bch_fs *c)
struct journal_keys *keys = &c->journal_keys;
struct journal_key *jk = keys->data;
+ move_gap(keys, keys->nr);
+
while (jk < &darray_top(*keys) &&
__journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
jk++;
@@ -832,9 +834,6 @@ int bch2_accounting_read(struct bch_fs *c)
iter.flags &= ~BTREE_ITER_with_journal;
int ret = for_each_btree_key_continue(trans, iter,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
-
if (k.k->type != KEY_TYPE_accounting)
continue;
@@ -863,7 +862,7 @@ int bch2_accounting_read(struct bch_fs *c)
struct disk_accounting_pos next_acc;
memset(&next_acc, 0, sizeof(next_acc));
next_acc.type = acc_k.type + 1;
- struct bpos next = disk_accounting_pos_to_bpos(&next_acc);
+ struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
if (jk < end)
next = bpos_min(next, journal_key_k(c, jk)->k.p);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 103719a76c81..78afd44a7a3f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -895,8 +895,60 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
* Hash table of open stripes:
* Stripes that are being created or modified are kept in a hash table, so that
* stripe deletion can skip them.
+ *
+ * Additionally, we have a hash table for buckets that have stripes being
+ * created, to avoid racing with rebalance:
*/
+/*
+ * Test whether @dev_bucket is present in the c->ec_stripes_new_buckets hash
+ * table.
+ *
+ * Takes no lock itself; bch2_bucket_has_new_stripe() is the variant that
+ * acquires c->ec_stripes_new_lock around this lookup.
+ *
+ * Returns true if an entry with a matching dev_bucket is found on the hash
+ * chain, false otherwise.
+ */
+static bool __bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
+{
+ /* Select the chain: hash width is log2 of the number of table heads */
+ unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
+ struct ec_stripe_new_bucket *s;
+
+ hlist_for_each_entry(s, &c->ec_stripes_new_buckets[hash], hash)
+ if (s->dev_bucket == dev_bucket)
+ return true;
+ return false;
+}
+
+/*
+ * Locked wrapper around __bch2_bucket_has_new_stripe(): performs the hash
+ * table lookup for @dev_bucket with c->ec_stripes_new_lock held and returns
+ * its result.
+ */
+bool bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
+{
+ /* Scope-based guard: the spinlock is released when this function returns */
+ guard(spinlock)(&c->ec_stripes_new_lock);
+ return __bch2_bucket_has_new_stripe(c, dev_bucket);
+}
+
+/*
+ * Record @dev_bucket in @s and insert @s at the head of the matching chain of
+ * c->ec_stripes_new_buckets.
+ *
+ * Takes no lock itself; the visible caller, stripe_new_buckets_add(), holds
+ * c->ec_stripes_new_lock across the call.
+ */
+static void stripe_new_bucket_add(struct bch_fs *c, struct ec_stripe_new_bucket *s, u64 dev_bucket)
+{
+ s->dev_bucket = dev_bucket;
+
+ /* Same hash computation as the lookup side uses to pick the chain */
+ unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
+ hlist_add_head(&s->hash, &c->ec_stripes_new_buckets[hash]);
+}
+
+/*
+ * Publish the buckets of stripe @s in the c->ec_stripes_new_buckets hash
+ * table, one entry per block slot (data blocks followed by parity blocks).
+ *
+ * The whole loop runs under c->ec_stripes_new_lock.
+ */
+static void stripe_new_buckets_add(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ unsigned nr_blocks = s->nr_data + s->nr_parity;
+
+ guard(spinlock)(&c->ec_stripes_new_lock);
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ /* Slots whose open-bucket index is 0 are skipped and never hashed */
+ if (!s->blocks[i])
+ continue;
+
+ /* s->blocks[i] is an index into the c->open_buckets array */
+ struct open_bucket *ob = c->open_buckets + s->blocks[i];
+ struct bpos bucket = POS(ob->dev, ob->bucket);
+
+ /* The (dev, bucket) position is flattened to a u64 for use as the hash key */
+ stripe_new_bucket_add(c, &s->buckets[i], bucket_to_u64(bucket));
+ }
+}
+
+/*
+ * Remove every bucket entry of stripe @s from the hash table, iterating over
+ * the nr_blocks recorded in the new stripe's key.
+ *
+ * NOTE(review): takes no lock itself - confirm the caller
+ * (bch2_stripe_close()) holds c->ec_stripes_new_lock.
+ *
+ * NOTE(review): the loop bound here is v->nr_blocks, whereas
+ * stripe_new_buckets_add() uses s->nr_data + s->nr_parity - confirm these
+ * always agree.
+ */
+static void stripe_new_buckets_del(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+
+ /*
+ * Unlinks all slots, including ones stripe_new_buckets_add() skipped;
+ * presumably relies on hlist_del_init() doing nothing for a node that
+ * was never hashed - TODO confirm buckets[] starts out zeroed.
+ */
+ for (unsigned i = 0; i < v->nr_blocks; i++)
+ hlist_del_init(&s->buckets[i].hash);
+}
+
static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
@@ -937,6 +989,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
hlist_del_init(&s->hash);
s->idx = 0;
+
+ stripe_new_buckets_del(c, s);
}
/* stripe deletion */
@@ -2027,6 +2081,7 @@ allocate_buf:
if (ret)
goto err;
+ stripe_new_buckets_add(c, s);
s->allocated = true;
allocated:
BUG_ON(!s->idx);
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index cc778da99030..85598448c7e1 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -191,6 +191,11 @@ enum ec_stripe_ref {
STRIPE_REF_NR
};
+/*
+ * One entry in the c->ec_stripes_new_buckets hash table; struct ec_stripe_new
+ * embeds an array of these (buckets[]).
+ */
+struct ec_stripe_new_bucket {
+ /* Linkage on a c->ec_stripes_new_buckets chain */
+ struct hlist_node hash;
+ /* Lookup key: a (dev, bucket) position converted with bucket_to_u64() */
+ u64 dev_bucket;
+};
+
struct ec_stripe_new {
struct bch_fs *c;
struct ec_stripe_head *h;
@@ -217,6 +222,8 @@ struct ec_stripe_new {
open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
+ struct ec_stripe_new_bucket buckets[BCH_BKEY_PTRS_MAX];
+
struct ec_stripe_buf new_stripe;
struct ec_stripe_buf existing_stripe;
};
@@ -248,6 +255,8 @@ struct ec_stripe_head {
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
+bool bch2_bucket_has_new_stripe(struct bch_fs *, u64);
+
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index a05046aeb999..fda4ca783848 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -1184,7 +1184,7 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
if (bi->bi_flags & BCH_INODE_unlinked)
bi->bi_flags &= ~BCH_INODE_unlinked;
else {
- if (bi->bi_nlink == U32_MAX)
+ if (bi->bi_nlink == BCH_LINK_MAX - nlink_bias(bi->bi_mode))
return -BCH_ERR_too_many_links;
bi->bi_nlink++;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ae747c87fcf9..f7b0fdd99c75 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -766,6 +766,9 @@ static int bch2_journal_reclaim_thread(void *arg)
set_freezable();
+ kthread_wait_freezable(test_bit(BCH_FS_rw, &c->flags) ||
+ kthread_should_stop());
+
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
@@ -826,8 +829,10 @@ int bch2_journal_reclaim_start(struct journal *j)
struct task_struct *p;
int ret;
- if (j->reclaim_thread)
+ if (j->reclaim_thread) {
+ wake_up_process(j->reclaim_thread);
return 0;
+ }
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 94de89d6a6cf..0e40e7bd3441 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -11,6 +11,7 @@
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
+#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
@@ -270,6 +271,10 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
r.need_rb &= !BIT(BCH_REBALANCE_data_checksum);
}
+ if (durability < r.data_replicas || durability >= r.data_replicas + min_durability)
+ r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
+ if (!unwritten && r.erasure_code != ec)
+ r.need_rb |= BIT(BCH_REBALANCE_erasure_code);
return r;
}
@@ -319,6 +324,17 @@ static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bke
return 0;
}
+/*
+ * Returns true if key @k contains at least one extent entry of type
+ * BCH_EXTENT_ENTRY_stripe_ptr, false otherwise.
+ */
+static bool bkey_has_ec(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ /* Walk every extent entry, not just the data pointers */
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+ return true;
+ return false;
+}
+
static int new_needs_rb_allowed(struct btree_trans *trans,
struct per_snapshot_io_opts *s,
struct bkey_s_c k,
@@ -348,9 +364,26 @@ static int new_needs_rb_allowed(struct btree_trans *trans,
ctx == SET_NEEDS_REBALANCE_opt_change_indirect)
return 0;
+ if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) &&
+ !bkey_has_ec(k)) {
+ /* Foreground writes are not initially erasure coded - and we
+ * may crash before a stripe is created
+ */
+ new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code);
+ }
+
if (ctx == SET_NEEDS_REBALANCE_foreground) {
new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)|
BIT(BCH_REBALANCE_background_target));
+
+ /*
+ * Foreground writes might end up degraded when a device is
+ * getting yanked:
+ *
+ * XXX: this is something we need to fix, but adding retries to
+ * the write path is something we have to do carefully.
+ */
+ new_need_rb &= ~BIT(BCH_REBALANCE_data_replicas);
if (!new_need_rb)
return 0;
@@ -748,6 +781,23 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
return &(&darray_pop(buf))->k_i;
}
+/*
+ * Check whether any pointer in @ptrs lands in a bucket that is currently
+ * registered in the new-stripe bucket hash table
+ * (bch2_bucket_has_new_stripe()).
+ *
+ * @trans is only used to reach the filesystem (trans->c).
+ *
+ * Returns true (nonzero) on the first matching pointer, false (0) if none
+ * match.
+ *
+ * NOTE(review): declared int but only ever returns true/false - bool looks
+ * like the intended return type.
+ */
+static int extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs)
+{
+ struct bch_fs *c = trans->c;
+
+ /* RCU read section covers the device lookups below */
+ guard(rcu)();
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ /* Pointers whose device lookup fails are ignored */
+ if (!ca)
+ continue;
+
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+ if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket)))
+ return true;
+ }
+ return false;
+}
+
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct per_snapshot_io_opts *snapshot_io_opts,
struct bpos work_pos,
@@ -790,6 +840,68 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->target = opts->background_target;
data_opts->write_flags |= BCH_WRITE_only_specified_devs;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) {
+ unsigned durability = bch2_bkey_durability(c, k);
+ unsigned ptr_bit = 1;
+
+ guard(rcu)();
+ if (durability <= opts->data_replicas) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+ if (ca && !ptr->cached && !ca->mi.durability)
+ data_opts->kill_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
+
+ data_opts->extra_replicas = opts->data_replicas - durability;
+ } else {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ if (d && durability - d >= opts->data_replicas) {
+ data_opts->kill_ptrs |= ptr_bit;
+ durability -= d;
+ }
+
+ ptr_bit <<= 1;
+ }
+
+ ptr_bit = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.has_ec && durability - p.ec.redundancy >= opts->data_replicas) {
+ data_opts->kill_ec_ptrs |= ptr_bit;
+ durability -= p.ec.redundancy;
+ }
+
+ ptr_bit <<= 1;
+ }
+ }
+ }
+
+ if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) {
+ if (opts->erasure_code) {
+ /* XXX: we'll need ratelimiting */
+ if (extent_ec_pending(trans, ptrs))
+ return bkey_s_c_null;
+
+ data_opts->extra_replicas = opts->data_replicas;
+ } else {
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.has_ec) {
+ data_opts->kill_ec_ptrs |= ptr_bit;
+ data_opts->extra_replicas += p.ec.redundancy;
+ }
+
+ ptr_bit <<= 1;
+ }
+ }
+ }
+
if (!data_opts->rewrite_ptrs &&
!data_opts->kill_ptrs &&
!data_opts->kill_ec_ptrs &&
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 9ec2df6c8071..010b2a86f1be 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -74,6 +74,7 @@ enum bch_fsck_flags {
x(btree_root_bad_min_key, 60, 0) \
x(btree_root_bad_max_key, 61, 0) \
x(btree_node_read_error, 62, FSCK_AUTOFIX) \
+ x(btree_node_topology_gap_between_nodes, 328, FSCK_AUTOFIX) \
x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \
x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \
x(btree_node_topology_bad_root_min_key, 323, FSCK_AUTOFIX) \
@@ -159,8 +160,8 @@ enum bch_fsck_flags {
x(extent_ptrs_redundant_stripe, 139, 0) \
x(extent_ptrs_unwritten, 140, 0) \
x(extent_ptrs_written_and_unwritten, 141, 0) \
- x(extent_rebalance_bad_pending, 330, 0) \
- x(extent_rebalance_bad_hipri, 331, 0) \
+ x(extent_rebalance_bad_pending, 331, 0) \
+ x(extent_rebalance_bad_hipri, 332, 0) \
x(ptr_to_invalid_device, 142, 0) \
x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \
x(ptr_to_duplicate_device, 143, 0) \
@@ -341,9 +342,9 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(extent_io_opts_not_set, 328, FSCK_AUTOFIX) \
- x(extent_io_opts_unneeded, 329, FSCK_AUTOFIX) \
- x(MAX, 332, 0)
+ x(extent_io_opts_not_set, 329, FSCK_AUTOFIX) \
+ x(extent_io_opts_unneeded, 330, FSCK_AUTOFIX) \
+ x(MAX, 333, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 52c6823ae7a4..3984f3cee929 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -564,15 +564,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
* successfully marked the filesystem dirty
*/
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
set_bit(BCH_FS_rw, &c->flags);
set_bit(BCH_FS_was_rw, &c->flags);
enumerated_ref_start(&c->writes);
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret) {
+ bch_err_msg(c, ret, "error starting journal reclaim thread");
+ goto err;
+ }
+
ret = bch2_copygc_start(c);
if (ret) {
bch_err_msg(c, ret, "error starting copygc thread");
@@ -852,7 +854,8 @@ int bch2_fs_init_rw(struct bch_fs *c)
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_io_write_init(c) ?:
- bch2_fs_journal_init(&c->journal);
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_journal_reclaim_start(&c->journal);
if (ret)
return ret;