diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/bcachefs/bcachefs.h | 2 | ||||
-rw-r--r-- | fs/bcachefs/btree_gc.c | 6 | ||||
-rw-r--r-- | fs/bcachefs/btree_io.c | 1 | ||||
-rw-r--r-- | fs/bcachefs/disk_accounting.c | 7 | ||||
-rw-r--r-- | fs/bcachefs/ec.c | 55 | ||||
-rw-r--r-- | fs/bcachefs/ec.h | 9 | ||||
-rw-r--r-- | fs/bcachefs/inode.c | 2 | ||||
-rw-r--r-- | fs/bcachefs/journal_reclaim.c | 7 | ||||
-rw-r--r-- | fs/bcachefs/rebalance.c | 112 | ||||
-rw-r--r-- | fs/bcachefs/sb-errors_format.h | 11 | ||||
-rw-r--r-- | fs/bcachefs/super.c | 13 |
11 files changed, 206 insertions, 19 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3ccca855f05e..933c5a68eff9 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -686,6 +686,7 @@ struct btree_debug { unsigned id; }; +#define BCH_LINK_MAX U32_MAX #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { @@ -1061,6 +1062,7 @@ struct bch_fs { GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; + struct hlist_head ec_stripes_new_buckets[64]; spinlock_t ec_stripes_new_lock; /* ERASURE CODING */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 63dc0836bf08..638c2a9268a9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -204,7 +204,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_eq(expected_start, cur->data->min_key)) return 0; - prt_printf(&buf, " at "); + prt_printf(&buf, " at "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_printf(&buf, ":\nparent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); @@ -229,8 +229,8 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * *pulled_from_scan = cur->data->min_key; ret = bch_err_throw(c, topology_repair_did_fill_from_scan); } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) + if (mustfix_fsck_err(trans, btree_node_topology_gap_between_nodes, + "gap between btree nodes%s", buf.buf)) ret = set_node_min(c, cur, expected_start); } } else { /* overlap */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 52d21259ed6f..3808c41dda84 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1318,6 +1318,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset_end(b, b->set); set_btree_node_need_rewrite(b); set_btree_node_need_rewrite_error(b); + ret = 0; continue; } if (ret) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 9da26e11446b..b20ea162bfa3 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -817,6 +817,8 @@ int bch2_accounting_read(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; struct journal_key *jk = keys->data; + move_gap(keys, keys->nr); + while (jk < &darray_top(*keys) && __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0) jk++; @@ -832,9 +834,6 @@ int bch2_accounting_read(struct bch_fs *c) iter.flags &= ~BTREE_ITER_with_journal; int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - if (k.k->type != KEY_TYPE_accounting) continue; @@ -863,7 +862,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos next_acc; memset(&next_acc, 0, sizeof(next_acc)); next_acc.type = acc_k.type + 1; - struct bpos next = disk_accounting_pos_to_bpos(&next_acc); + struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc)); if (jk < end) next = bpos_min(next, journal_key_k(c, jk)->k.p); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 103719a76c81..78afd44a7a3f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -895,8 +895,60 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, * Hash table of open stripes: * Stripes that are being created or modified are kept in a hash table, so that * stripe deletion can skip them. + * + * Additionally, we have a hash table for buckets that have stripes being + * created, to avoid racing with rebalance: */ +static bool __bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket) +{ + unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets))); + struct ec_stripe_new_bucket *s; + + hlist_for_each_entry(s, &c->ec_stripes_new_buckets[hash], hash) + if (s->dev_bucket == dev_bucket) + return true; + return false; +} + +bool bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket) +{ + guard(spinlock)(&c->ec_stripes_new_lock); + return __bch2_bucket_has_new_stripe(c, dev_bucket); +} + +static void stripe_new_bucket_add(struct bch_fs *c, struct ec_stripe_new_bucket *s, u64 dev_bucket) +{ + s->dev_bucket = dev_bucket; + + unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets))); + hlist_add_head(&s->hash, &c->ec_stripes_new_buckets[hash]); +} + +static void stripe_new_buckets_add(struct bch_fs *c, struct ec_stripe_new *s) +{ + unsigned nr_blocks = s->nr_data + s->nr_parity; + + guard(spinlock)(&c->ec_stripes_new_lock); + for (unsigned i = 0; i < nr_blocks; i++) { + if (!s->blocks[i]) + continue; + + struct open_bucket *ob = c->open_buckets + s->blocks[i]; + struct bpos bucket = POS(ob->dev, ob->bucket); + + stripe_new_bucket_add(c, &s->buckets[i], bucket_to_u64(bucket)); + } +} + +static void stripe_new_buckets_del(struct bch_fs *c, struct ec_stripe_new *s) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + + for (unsigned i = 0; i < v->nr_blocks; i++) + hlist_del_init(&s->buckets[i].hash); +} + static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) { unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); @@ -937,6 +989,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) hlist_del_init(&s->hash); s->idx = 0; + + stripe_new_buckets_del(c, s); } /* stripe deletion */ @@ -2027,6 +2081,7 @@ allocate_buf: if (ret) goto err; + stripe_new_buckets_add(c, s); s->allocated = true; allocated: BUG_ON(!s->idx); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index cc778da99030..85598448c7e1 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -191,6 +191,11 @@ enum ec_stripe_ref { STRIPE_REF_NR }; +struct ec_stripe_new_bucket { + struct hlist_node hash; + u64 dev_bucket; +}; + struct ec_stripe_new { struct bch_fs *c; struct ec_stripe_head *h; @@ -217,6 +222,8 @@ struct ec_stripe_new { open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; + struct ec_stripe_new_bucket buckets[BCH_BKEY_PTRS_MAX]; + struct ec_stripe_buf new_stripe; struct ec_stripe_buf existing_stripe; }; @@ -248,6 +255,8 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); +bool bch2_bucket_has_new_stripe(struct bch_fs *, u64); + void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index a05046aeb999..fda4ca783848 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1184,7 +1184,7 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) if (bi->bi_flags & BCH_INODE_unlinked) bi->bi_flags &= ~BCH_INODE_unlinked; else { - if (bi->bi_nlink == U32_MAX) + if (bi->bi_nlink == BCH_LINK_MAX - nlink_bias(bi->bi_mode)) return -BCH_ERR_too_many_links; bi->bi_nlink++; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ae747c87fcf9..f7b0fdd99c75 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -766,6 +766,9 @@ static int bch2_journal_reclaim_thread(void *arg) set_freezable(); + kthread_wait_freezable(test_bit(BCH_FS_rw, &c->flags) || + kthread_should_stop()); + j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { @@ -826,8 +829,10 @@ int bch2_journal_reclaim_start(struct journal *j) struct task_struct *p; int ret; - if (j->reclaim_thread) + if (j->reclaim_thread) { + wake_up_process(j->reclaim_thread); return 0; + } p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 94de89d6a6cf..0e40e7bd3441 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -11,6 +11,7 @@ #include "clock.h" #include "compress.h" #include "disk_groups.h" +#include "ec.h" #include "errcode.h" #include "error.h" #include "inode.h" @@ -270,6 +271,10 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, r.need_rb &= !BIT(BCH_REBALANCE_data_checksum); } + if (durability < r.data_replicas || durability >= r.data_replicas + min_durability) + r.need_rb |= BIT(BCH_REBALANCE_data_replicas); + if (!unwritten && r.erasure_code != ec) + r.need_rb |= BIT(BCH_REBALANCE_erasure_code); return r; } @@ -319,6 +324,17 @@ static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bke return 0; } +static bool bkey_has_ec(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) + return true; + return false; +} + static int new_needs_rb_allowed(struct btree_trans *trans, struct per_snapshot_io_opts *s, struct bkey_s_c k, @@ -348,9 +364,26 @@ static int new_needs_rb_allowed(struct btree_trans *trans, ctx == SET_NEEDS_REBALANCE_opt_change_indirect) return 0; + if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) && + !bkey_has_ec(k)) { + /* Foreground writes are not initially erasure coded - and we + * may crash before a stripe is created + */ + new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code); + } + if (ctx == SET_NEEDS_REBALANCE_foreground) { new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)| BIT(BCH_REBALANCE_background_target)); + + /* + * Foreground writes might end up degraded when a device is + * getting yanked: + * + * XXX: this is something we need to fix, but adding retries to + * the write path is something we have to do carefully. + */ + new_need_rb &= ~BIT(BCH_REBALANCE_data_replicas); if (!new_need_rb) return 0; @@ -748,6 +781,23 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, return &(&darray_pop(buf))->k_i; } +static int extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs) +{ + struct bch_fs *c = trans->c; + + guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (!ca) + continue; + + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket))) + return true; + } + return false; +} + static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, @@ -790,6 +840,68 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) { + unsigned durability = bch2_bkey_durability(c, k); + unsigned ptr_bit = 1; + + guard(rcu)(); + if (durability <= opts->data_replicas) { + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (ca && !ptr->cached && !ca->mi.durability) + data_opts->kill_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + data_opts->extra_replicas = opts->data_replicas - durability; + } else { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= opts->data_replicas) { + data_opts->kill_ptrs |= ptr_bit; + durability -= d; + } + + ptr_bit <<= 1; + } + + ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.has_ec && durability - p.ec.redundancy >= opts->data_replicas) { + data_opts->kill_ec_ptrs |= ptr_bit; + durability -= p.ec.redundancy; + } + + ptr_bit <<= 1; + } + } + } + + if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) { + if (opts->erasure_code) { + /* XXX: we'll need ratelimiting */ + if (extent_ec_pending(trans, ptrs)) + return bkey_s_c_null; + + data_opts->extra_replicas = opts->data_replicas; + } else { + unsigned ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.has_ec) { + data_opts->kill_ec_ptrs |= ptr_bit; + data_opts->extra_replicas += p.ec.redundancy; + } + + ptr_bit <<= 1; + } + } + } + if (!data_opts->rewrite_ptrs && !data_opts->kill_ptrs && !data_opts->kill_ec_ptrs && diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9ec2df6c8071..010b2a86f1be 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -74,6 +74,7 @@ enum bch_fsck_flags { x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ x(btree_node_read_error, 62, FSCK_AUTOFIX) \ + x(btree_node_topology_gap_between_nodes, 328, FSCK_AUTOFIX) \ x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ x(btree_node_topology_bad_root_min_key, 323, FSCK_AUTOFIX) \ @@ -159,8 +160,8 @@ enum bch_fsck_flags { x(extent_ptrs_redundant_stripe, 139, 0) \ x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ - x(extent_rebalance_bad_pending, 330, 0) \ - x(extent_rebalance_bad_hipri, 331, 0) \ + x(extent_rebalance_bad_pending, 331, 0) \ + x(extent_rebalance_bad_hipri, 332, 0) \ x(ptr_to_invalid_device, 142, 0) \ x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_duplicate_device, 143, 0) \ @@ -341,9 +342,9 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(extent_io_opts_not_set, 328, FSCK_AUTOFIX) \ - x(extent_io_opts_unneeded, 329, FSCK_AUTOFIX) \ - x(MAX, 332, 0) + x(extent_io_opts_not_set, 329, FSCK_AUTOFIX) \ + x(extent_io_opts_unneeded, 330, FSCK_AUTOFIX) \ + x(MAX, 333, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 52c6823ae7a4..3984f3cee929 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -564,15 +564,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) * successfully marked the filesystem dirty */ - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) - goto err; - set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); enumerated_ref_start(&c->writes); + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) { + bch_err_msg(c, ret, "error starting journal reclaim thread"); + goto err; + } + ret = bch2_copygc_start(c); if (ret) { bch_err_msg(c, ret, "error starting copygc thread"); @@ -852,7 +854,8 @@ int bch2_fs_init_rw(struct bch_fs *c) bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_io_write_init(c) ?: - bch2_fs_journal_init(&c->journal); + bch2_fs_journal_init(&c->journal) ?: + bch2_journal_reclaim_start(&c->journal); if (ret) return ret; |