Diffstat (limited to 'fs/bcachefs'): 41 files changed, 1685 insertions, 555 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5455412b2b75..df383b97aacc 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -21,7 +21,6 @@ config BCACHEFS_FS select XOR_BLOCKS select XXHASH select SYMBOLIC_ERRNAME - select MIN_HEAP help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. @@ -34,7 +33,6 @@ config BCACHEFS_QUOTA config BCACHEFS_ERASURE_CODING bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)" depends on BCACHEFS_FS - select QUOTACTL help This enables the "erasure_code" filesysystem and inode option, which organizes data into reed-solomon stripes instead of ordinary @@ -43,11 +41,6 @@ config BCACHEFS_ERASURE_CODING WARNING: this feature is still undergoing on disk format changes, and should only be enabled for testing purposes. -config BCACHEFS_POSIX_ACL - bool "bcachefs POSIX ACL support" - depends on BCACHEFS_FS - select FS_POSIX_ACL - config BCACHEFS_DEBUG bool "bcachefs debugging" depends on BCACHEFS_FS diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1e87eee962ec..98acb3dc66a3 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -1,4 +1,9 @@ +ifdef BCACHEFS_DKMS + CONFIG_BCACHEFS_FS := m + # Enable other features here? +endif + obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o bcachefs-y := \ @@ -99,9 +104,16 @@ bcachefs-y := \ util.o \ varint.o \ xattr.o \ - vendor/closure.o + vendor/closure.o \ + vendor/min_heap.o -obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o +ifndef BCACHEFS_DKMS + obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o +endif # Silence "note: xyz changed in GCC X.X" messages subdir-ccflags-y += $(call cc-disable-warning, psabi) + +ifdef BCACHEFS_DKMS + subdir-ccflags-y += -I$(src) +endif diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 3befa1f36e72..a1df0ec2d511 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -57,7 +57,7 @@ void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) } } -#ifdef CONFIG_BCACHEFS_POSIX_ACL +#ifndef NO_BCACHEFS_FS #include "fs.h" @@ -437,4 +437,4 @@ err: return ret; } -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index fe730a6bf0c1..01cf21c5a0b6 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -26,7 +26,7 @@ typedef struct { void bch2_acl_to_text(struct printbuf *, const void *, size_t); -#ifdef CONFIG_BCACHEFS_POSIX_ACL +#ifndef NO_BCACHEFS_FS struct posix_acl *bch2_get_acl(struct inode *, int, bool); @@ -55,6 +55,6 @@ static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, return 0; } -#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +#endif /* NO_BCACHEFS_FS */ #endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3ccca855f05e..933c5a68eff9 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -686,6 +686,7 @@ struct btree_debug { unsigned id; }; +#define BCH_LINK_MAX U32_MAX #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { @@ -1061,6 +1062,7 @@ struct bch_fs { GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; + struct hlist_head ec_stripes_new_buckets[64]; spinlock_t ec_stripes_new_lock; /* ERASURE CODING */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d29bd684b137..090f11e122ad 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -707,7 +707,8 @@ struct 
bch_sb_field_ext { x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \ x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \ x(31bit_dirent_offset, BCH_VERSION(1, 30)) \ - x(btree_node_accounting, BCH_VERSION(1, 31)) + x(btree_node_accounting, BCH_VERSION(1, 31)) \ + x(rebalance_v2, BCH_VERSION(1, 32)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 63dc0836bf08..638c2a9268a9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -204,7 +204,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_eq(expected_start, cur->data->min_key)) return 0; - prt_printf(&buf, " at "); + prt_printf(&buf, " at "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_printf(&buf, ":\nparent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); @@ -229,8 +229,8 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * *pulled_from_scan = cur->data->min_key; ret = bch_err_throw(c, topology_repair_did_fill_from_scan); } else { - if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, - "btree node with incorrect min_key%s", buf.buf)) + if (mustfix_fsck_err(trans, btree_node_topology_gap_between_nodes, + "gap between btree nodes%s", buf.buf)) ret = set_node_min(c, cur, expected_start); } } else { /* overlap */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 52d21259ed6f..3808c41dda84 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1318,6 +1318,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset_end(b, b->set); set_btree_node_need_rewrite(b); set_btree_node_need_rewrite_error(b); + ret = 0; continue; } if (ret) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index a6f344faf751..73fa8513ee61 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -838,8 +838,10 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, bpos_ge(k->k.p, start) && bpos_le(k->k.p, end))) keys->data[dst++] = *i; - else if (i->allocated) + else if (i->allocated) { kfree(i->allocated_k); + i->allocated_k = NULL; + } } keys->nr = keys->gap = dst; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index c0dff992ad60..433d498540fd 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -12,7 +12,6 @@ #include "recovery_passes.h" #include <linux/kthread.h> -#include <linux/min_heap.h> #include <linux/sched/sysctl.h> #include <linux/sort.h> diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 10bfadcde80a..362846d5bb87 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, return bch2_csum_opt_to_type(opts.data_checksum, true); } +static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c, + struct bch_extent_rebalance opts) +{ + if (c->sb.encryption_type) + return c->opts.wide_macs + ? 
BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; + + return bch2_csum_opt_to_type(opts.data_checksum, true); +} + static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 37554e4514fe..d12c11dc6601 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_CLOCK_TYPES_H #define _BCACHEFS_CLOCK_TYPES_H -#include "util.h" +#include "vendor/min_heap.h" #define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 62d5d17d681e..02997b061ce5 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -55,63 +55,6 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) return true; } -static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } -} - -static noinline_for_stack -bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, - const struct bch_extent_ptr *start) -{ - if (!ctxt) { - bkey_for_each_ptr(ptrs, ptr) { - if (ptr == start) - break; - - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - - __bkey_for_each_ptr(start, ptrs.end, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - bool locked; - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); - if (!locked) { - bch2_trans_unlock(ctxt->trans); - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } - } - return true; -} - -static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs) -{ - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) - return __bkey_nocow_lock(c, ctxt, ptrs, ptr); - } - - return true; -} - noinline_for_stack static void trace_io_move_finish2(struct data_update *u, struct bkey_i *new, @@ -210,28 +153,6 @@ static void trace_data_update2(struct data_update *m, } noinline_for_stack -static void trace_io_move_created_rebalance2(struct data_update *m, - struct bkey_s_c old, struct bkey_s_c k, - struct bkey_i *insert) -{ - struct bch_fs *c = m->op.c; - CLASS(printbuf, buf)(); - - bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_io_move_created_rebalance(c, buf.buf); - - count_event(c, io_move_created_rebalance); -} - -noinline_for_stack static int data_update_invalid_bkey(struct data_update *m, struct bkey_s_c old, struct bkey_s_c k, struct bkey_i *insert) @@ -440,7 +361,7 @@ restart_drop_extra_replicas: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?: - 
bch2_bkey_set_needs_rebalance(c, &opts, insert, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert, SET_NEEDS_REBALANCE_foreground, m->op.opts.change_cookie) ?: bch2_trans_update(trans, &iter, insert, @@ -451,10 +372,6 @@ restart_drop_extra_replicas: if (trace_data_update_enabled()) trace_data_update2(m, old, k, insert); - if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > - bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) - trace_io_move_created_rebalance2(m, old, k, insert); - ret = bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| @@ -538,7 +455,7 @@ void bch2_data_update_exit(struct data_update *update) update->bvecs = NULL; if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); + bch2_bkey_nocow_unlock(c, k, 0); bkey_put_dev_refs(c, k); bch2_disk_reservation_put(c, &update->op.res); bch2_bkey_buf_exit(&update->k, c); @@ -1018,10 +935,19 @@ int bch2_data_update_init(struct btree_trans *trans, goto out; } - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, ptrs)) { - ret = bch_err_throw(c, nocow_lock_blocked); - goto out_put_dev_refs; + if (c->opts.nocow_enabled) { + if (!bch2_bkey_nocow_trylock(c, ptrs, 0)) { + bool locked = false; + if (ctxt) + move_ctxt_wait_event(ctxt, + (locked = bch2_bkey_nocow_trylock(c, ptrs, 0)) || + list_empty(&ctxt->ios)); + if (!locked) { + if (ctxt) + bch2_trans_unlock(ctxt->trans); + bch2_bkey_nocow_lock(c, ptrs, 0); + } + } } if (unwritten) { @@ -1039,8 +965,7 @@ int bch2_data_update_init(struct btree_trans *trans, return 0; out_nocow_unlock: if (c->opts.nocow_enabled) - bkey_nocow_unlock(c, k); -out_put_dev_refs: + bch2_bkey_nocow_unlock(c, k, 0); bkey_put_dev_refs(c, k); out: bch2_disk_reservation_put(c, &m->op.res); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a99f821c6a1c..73f50e5489b4 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -282,6 +282,9 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_str(out, "btree="); bch2_btree_id_to_text(out, k->btree.id); break; + case BCH_DISK_ACCOUNTING_rebalance_work_v2: + bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type); + break; } } @@ -786,103 +789,10 @@ static struct journal_key *accumulate_and_read_journal_accounting(struct btree_t return ret ? ERR_PTR(ret) : next; } -/* - * At startup time, initialize the in memory accounting from the btree (and - * journal) - */ -int bch2_accounting_read(struct bch_fs *c) +static int accounting_read_mem_fixups(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct bch_accounting_mem *acc = &c->accounting; - CLASS(btree_trans, trans)(c); - CLASS(printbuf, buf)(); - - /* - * We might run more than once if we rewind to start topology repair or - * btree node scan - and those might cause us to get different results, - * so we can't just skip if we've already run. 
- * - * Instead, zero out any accounting we have: - */ - scoped_guard(percpu_write, &c->mark_lock) { - darray_for_each(acc->k, e) - percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); - for_each_member_device(c, ca) - percpu_memset(ca->usage, 0, sizeof(*ca->usage)); - percpu_memset(c->usage, 0, sizeof(*c->usage)); - } - - struct journal_keys *keys = &c->journal_keys; - struct journal_key *jk = keys->data; - - while (jk < &darray_top(*keys) && - __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0) - jk++; - - struct journal_key *end = jk; - while (end < &darray_top(*keys) && - __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0) - end++; - - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); - iter.flags &= ~BTREE_ITER_with_journal; - int ret = for_each_btree_key_continue(trans, iter, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); - - if (k.k->type != KEY_TYPE_accounting) - continue; - - while (jk < end && - __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0) - jk = accumulate_and_read_journal_accounting(trans, jk); - - while (jk < end && - __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 && - bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) { - jk->overwritten = true; - jk++; - } - - if (jk < end && - __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0) - jk = accumulate_and_read_journal_accounting(trans, jk); - - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, k.k->p); - - if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - break; - - if (!bch2_accounting_is_mem(&acc_k)) { - struct disk_accounting_pos next_acc; - memset(&next_acc, 0, sizeof(next_acc)); - next_acc.type = acc_k.type + 1; - struct bpos next = disk_accounting_pos_to_bpos(&next_acc); - if (jk < end) - next = bpos_min(next, journal_key_k(c, jk)->k.p); - - bch2_btree_iter_set_pos(&iter, next); - continue; - } - - accounting_read_key(trans, k); - })); - bch2_trans_iter_exit(&iter); - if (ret) - return ret; - - while (jk < end) - jk = accumulate_and_read_journal_accounting(trans, jk); - - struct journal_key *dst = keys->data; - darray_for_each(*keys, i) - if (!i->overwritten) - *dst++ = *i; - keys->gap = keys->nr = dst - keys->data; - CLASS(printbuf, underflow_err)(); scoped_guard(percpu_write, &c->mark_lock) { @@ -903,7 +813,7 @@ int bch2_accounting_read(struct bch_fs *c) * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ - ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) + int ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? 
-BCH_ERR_remove_disk_accounting_entry : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); @@ -985,7 +895,7 @@ int bch2_accounting_read(struct bch_fs *c) if (underflow_err.pos) { bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err); unsigned pos = underflow_err.pos; - ret = bch2_run_explicit_recovery_pass(c, &underflow_err, + int ret = bch2_run_explicit_recovery_pass(c, &underflow_err, BCH_RECOVERY_PASS_check_allocations, 0); print |= underflow_err.pos != pos; @@ -995,7 +905,108 @@ int bch2_accounting_read(struct bch_fs *c) return ret; } - return ret; + return 0; +} + +/* + * At startup time, initialize the in memory accounting from the btree (and + * journal) + */ +int bch2_accounting_read(struct bch_fs *c) +{ + struct bch_accounting_mem *acc = &c->accounting; + CLASS(btree_trans, trans)(c); + CLASS(printbuf, buf)(); + + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. + * + * Instead, zero out any accounting we have: + */ + scoped_guard(percpu_write, &c->mark_lock) { + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + } + + struct journal_keys *keys = &c->journal_keys; + struct journal_key *jk = keys->data; + + move_gap(keys, keys->nr); + + while (jk < &darray_top(*keys) && + __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0) + jk++; + + struct journal_key *end = jk; + while (end < &darray_top(*keys) && + __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0) + end++; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + iter.flags &= ~BTREE_ITER_with_journal; + int ret = for_each_btree_key_continue(trans, iter, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + if (k.k->type != KEY_TYPE_accounting) + continue; + + while (jk < end && + __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0) + jk = accumulate_and_read_journal_accounting(trans, jk); + + while (jk < end && + __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 && + bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) { + jk->overwritten = true; + jk++; + } + + if (jk < end && + __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0) + jk = accumulate_and_read_journal_accounting(trans, jk); + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + + if (!bch2_accounting_is_mem(&acc_k)) { + struct disk_accounting_pos next_acc; + memset(&next_acc, 0, sizeof(next_acc)); + next_acc.type = acc_k.type + 1; + struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc)); + if (jk < end) + next = bpos_min(next, journal_key_k(c, jk)->k.p); + + bch2_btree_iter_set_pos(&iter, next); + continue; + } + + accounting_read_key(trans, k); + })); + bch2_trans_iter_exit(&iter); + if (ret) + return ret; + + while (jk < end) + jk = accumulate_and_read_journal_accounting(trans, jk); + + bch2_trans_unlock(trans); + + struct journal_key *dst = keys->data; + darray_for_each(*keys, i) + if (!i->overwritten) + *dst++ = *i; + keys->gap = keys->nr = dst - keys->data; + + return accounting_read_mem_fixups(trans); } int 
bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 730a17ea4243..0b61d6100180 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -110,7 +110,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(snapshot, 5, 1) \ x(btree, 6, 3) \ x(rebalance_work, 7, 1) \ - x(inum, 8, 3) + x(inum, 8, 3) \ + x(rebalance_work_v2, 9, 1) \ enum disk_accounting_type { #define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, @@ -210,6 +211,10 @@ struct bch_acct_inum { struct bch_acct_rebalance_work { }; +struct bch_acct_rebalance_work_v2 { + __u8 type; +}; + struct disk_accounting_pos { union { struct { @@ -224,6 +229,7 @@ struct disk_accounting_pos { struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; struct bch_acct_inum inum; + struct bch_acct_rebalance_work_v2 rebalance_work_v2; } __packed; } __packed; struct bpos _pad; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 89a95b6c4e51..78afd44a7a3f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -895,8 +895,60 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, * Hash table of open stripes: * Stripes that are being created or modified are kept in a hash table, so that * stripe deletion can skip them. + * + * Additionally, we have a hash table for buckets that have stripes being + * created, to avoid racing with rebalance: */ +static bool __bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket) +{ + unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets))); + struct ec_stripe_new_bucket *s; + + hlist_for_each_entry(s, &c->ec_stripes_new_buckets[hash], hash) + if (s->dev_bucket == dev_bucket) + return true; + return false; +} + +bool bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket) +{ + guard(spinlock)(&c->ec_stripes_new_lock); + return __bch2_bucket_has_new_stripe(c, dev_bucket); +} + +static void stripe_new_bucket_add(struct bch_fs *c, struct ec_stripe_new_bucket *s, u64 dev_bucket) +{ + s->dev_bucket = dev_bucket; + + unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets))); + hlist_add_head(&s->hash, &c->ec_stripes_new_buckets[hash]); +} + +static void stripe_new_buckets_add(struct bch_fs *c, struct ec_stripe_new *s) +{ + unsigned nr_blocks = s->nr_data + s->nr_parity; + + guard(spinlock)(&c->ec_stripes_new_lock); + for (unsigned i = 0; i < nr_blocks; i++) { + if (!s->blocks[i]) + continue; + + struct open_bucket *ob = c->open_buckets + s->blocks[i]; + struct bpos bucket = POS(ob->dev, ob->bucket); + + stripe_new_bucket_add(c, &s->buckets[i], bucket_to_u64(bucket)); + } +} + +static void stripe_new_buckets_del(struct bch_fs *c, struct ec_stripe_new *s) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + + for (unsigned i = 0; i < v->nr_blocks; i++) + hlist_del_init(&s->buckets[i].hash); +} + static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) { unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); @@ -937,6 +989,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) hlist_del_init(&s->hash); s->idx = 0; + + stripe_new_buckets_del(c, s); } /* stripe deletion */ @@ -1134,7 +1188,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, ret = bch2_extent_get_io_opts_one(trans, &opts, &iter, bkey_i_to_s_c(n), SET_NEEDS_REBALANCE_other) ?: - bch2_bkey_set_needs_rebalance(trans->c, &opts, n, + 
bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_other, 0) ?: bch2_trans_update(trans, &iter, n, 0); out: @@ -2027,6 +2081,7 @@ allocate_buf: if (ret) goto err; + stripe_new_buckets_add(c, s); s->allocated = true; allocated: BUG_ON(!s->idx); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index cc778da99030..85598448c7e1 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -191,6 +191,11 @@ enum ec_stripe_ref { STRIPE_REF_NR }; +struct ec_stripe_new_bucket { + struct hlist_node hash; + u64 dev_bucket; +}; + struct ec_stripe_new { struct bch_fs *c; struct ec_stripe_head *h; @@ -217,6 +222,8 @@ struct ec_stripe_new { open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; + struct ec_stripe_new_bucket buckets[BCH_BKEY_PTRS_MAX]; + struct ec_stripe_buf new_stripe; struct ec_stripe_buf existing_stripe; }; @@ -248,6 +255,8 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); +bool bch2_bucket_has_new_stripe(struct bch_fs *, u64); + void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index cbf1eedddad7..2712a9754100 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -228,6 +228,7 @@ x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \ x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \ x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \ + x(EMLINK, too_many_links) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EOPNOTSUPP, no_casefolding_without_utf8) \ x(EOPNOTSUPP, casefolding_disabled) \ @@ -367,7 +368,10 @@ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, invalid_snapshot_node) \ x(0, option_needs_open_fs) \ - x(0, remove_disk_accounting_entry) + x(0, remove_disk_accounting_entry) \ + x(0, nocow_trylock_fail) \ + x(BCH_ERR_nocow_trylock_fail, nocow_trylock_contended) \ + x(BCH_ERR_nocow_trylock_fail, nocow_trylock_bucket_full) \ enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 3274ba42c995..c534b009bf60 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1522,24 +1522,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, "redundant stripe entry"); have_ec = true; break; - case BCH_EXTENT_ENTRY_rebalance: { - /* - * this shouldn't be a fsck error, for forward - * compatibility; the rebalance code should just refetch - * the compression opt if it's unknown - */ -#if 0 - const struct bch_extent_rebalance *r = &entry->rebalance; - - if (!bch2_compression_opt_valid(r->compression)) { - union bch_compression_opt opt = { .value = r->compression }; - prt_printf(err, "invalid compression opt %u:%u", - opt.type, opt.level); - return bch_err_throw(c, invalid_bkey); - } -#endif + case BCH_EXTENT_ENTRY_rebalance: + ret = bch2_extent_rebalance_validate(c, k, from, &entry->rebalance); + if (ret) + return ret; break; - } case BCH_EXTENT_ENTRY_flags: bkey_fsck_err_on(entry != ptrs.start, c, extent_flags_not_at_start, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f1849eb8327d..414adf86efdc 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -541,11 +541,9 @@ __bch2_create(struct mnt_idmap *idmap, * preallocate acls + vfs inode before btree transaction, so that * nothing can fail after the transaction succeeds: */ -#ifdef CONFIG_BCACHEFS_POSIX_ACL ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); if (ret) 
return ERR_PTR(ret); -#endif inode = __bch2_new_inode(c, GFP_NOFS); if (unlikely(!inode)) { @@ -1751,10 +1749,8 @@ static const struct inode_operations bch_file_inode_operations = { .setattr = bch2_setattr, .fiemap = bch2_fiemap, .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, -#endif .fileattr_get = bch2_fileattr_get, .fileattr_set = bch2_fileattr_set, }; @@ -1773,10 +1769,8 @@ static const struct inode_operations bch_dir_inode_operations = { .setattr = bch2_setattr, .tmpfile = bch2_tmpfile, .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, -#endif .fileattr_get = bch2_fileattr_get, .fileattr_set = bch2_fileattr_set, }; @@ -1797,10 +1791,8 @@ static const struct inode_operations bch_symlink_inode_operations = { .getattr = bch2_getattr, .setattr = bch2_setattr, .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, -#endif .fileattr_get = bch2_fileattr_get, .fileattr_set = bch2_fileattr_set, }; @@ -1809,10 +1801,8 @@ static const struct inode_operations bch_special_inode_operations = { .getattr = bch2_getattr, .setattr = bch2_setattr, .listxattr = bch2_xattr_list, -#ifdef CONFIG_BCACHEFS_POSIX_ACL .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, -#endif .fileattr_get = bch2_fileattr_get, .fileattr_set = bch2_fileattr_set, }; @@ -2546,10 +2536,8 @@ got_sb: c->dev = sb->s_dev; -#ifdef CONFIG_BCACHEFS_POSIX_ACL if (c->opts.acl) sb->s_flags |= SB_POSIXACL; -#endif sb->s_shrink->seeks = 0; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 543627fb58be..fda4ca783848 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1184,8 +1184,8 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) if (bi->bi_flags & BCH_INODE_unlinked) bi->bi_flags &= ~BCH_INODE_unlinked; else { - if (bi->bi_nlink == U32_MAX) - return -EINVAL; + if (bi->bi_nlink == BCH_LINK_MAX - nlink_bias(bi->bi_mode)) + return -BCH_ERR_too_many_links; bi->bi_nlink++; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 6a5da02ce266..66ee07be149e 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -33,7 +33,6 @@ #include <linux/blkdev.h> #include <linux/moduleparam.h> -#include <linux/prefetch.h> #include <linux/random.h> #include <linux/sched/mm.h> @@ -365,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans, min(k->k.p.offset << 9, new_i_size), i_sectors_delta, &inode) ?: (bch2_inode_opts_get_inode(c, &inode, &opts), - bch2_bkey_set_needs_rebalance(c, &opts, k, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k, SET_NEEDS_REBALANCE_foreground, change_cookie)) ?: bch2_trans_update(trans, iter, k, 0) ?: @@ -1271,7 +1270,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0, &inode) ?: (bch2_inode_opts_get_inode(c, &inode, &opts), - bch2_bkey_set_needs_rebalance(c, &opts, new, + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new, SET_NEEDS_REBALANCE_foreground, op->opts.change_cookie)) ?: bch2_trans_update(trans, iter, new, @@ -1323,11 +1322,25 @@ static CLOSURE_CALLBACK(bch2_nocow_write_done) bch2_write_done(cl); } -struct bucket_to_lock { - struct bpos b; - unsigned gen; - struct nocow_lock_bucket *l; -}; +static bool bkey_get_dev_iorefs(struct bch_fs *c, struct bkey_ptrs_c ptrs) +{ + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca 
= bch2_dev_get_ioref(c, ptr->dev, WRITE, + BCH_DEV_WRITE_REF_io_write); + if (unlikely(!ca)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + enumerated_ref_put(&bch2_dev_have_ref(c, ptr2->dev)->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); + } + + return false; + } + } + + return true; +} static void bch2_nocow_write(struct bch_write_op *op) { @@ -1335,15 +1348,14 @@ static void bch2_nocow_write(struct bch_write_op *op) struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; - DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; + struct bkey_ptrs_c ptrs; u32 snapshot; - struct bucket_to_lock *stale_at; + const struct bch_extent_ptr *stale_at; int stale, ret; if (op->flags & BCH_WRITE_move) return; - darray_init(&buckets); trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); @@ -1358,8 +1370,6 @@ retry: while (1) { struct bio *bio = &op->wbio.bio; - buckets.nr = 0; - ret = bch2_trans_relock(trans); if (ret) break; @@ -1381,50 +1391,42 @@ retry: break; /* Get iorefs before dropping btree locks: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, - BCH_DEV_WRITE_REF_io_write); - if (unlikely(!ca)) - goto err_get_ioref; - - struct bpos b = PTR_BUCKET_POS(ca, ptr); - struct nocow_lock_bucket *l = - bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); - prefetch(l); - - /* XXX allocating memory with btree locks held - rare */ - darray_push_gfp(&buckets, ((struct bucket_to_lock) { - .b = b, .gen = ptr->gen, .l = l, - }), GFP_KERNEL|__GFP_NOFAIL); - - if (ptr->unwritten) - op->flags |= BCH_WRITE_convert_unwritten; - } + ptrs = bch2_bkey_ptrs_c(k); + if (!bkey_get_dev_iorefs(c, ptrs)) + goto out; /* Unlock before taking nocow locks, doing IO: */ bkey_reassemble(op->insert_keys.top, k); - bch2_trans_unlock(trans); + k = bkey_i_to_s_c(op->insert_keys.top); + ptrs = bch2_bkey_ptrs_c(k); - bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_convert_unwritten) - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + bch2_trans_unlock(trans); - darray_for_each(buckets, i) { - struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); + bch2_bkey_nocow_lock(c, ptrs, BUCKET_NOCOW_LOCK_UPDATE); - __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, - bucket_to_u64(i->b), - BUCKET_NOCOW_LOCK_UPDATE); + /* + * This could be handled better: If we're able to trylock the + * nocow locks with btree locks held we know dirty pointers + * can't be stale + */ + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - int gen = bucket_gen_get(ca, i->b.offset); - stale = gen < 0 ? gen : gen_after(gen, i->gen); + int gen = bucket_gen_get(ca, PTR_BUCKET_NR(ca, ptr)); + stale = gen < 0 ? 
gen : gen_after(gen, ptr->gen); if (unlikely(stale)) { - stale_at = i; + stale_at = ptr; goto err_bucket_stale; } + + if (ptr->unwritten) + op->flags |= BCH_WRITE_convert_unwritten; } + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_convert_unwritten) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + bio = &op->wbio.bio; if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { bio = bio_split(bio, k.k->p.offset - op->pos.offset, @@ -1458,8 +1460,6 @@ err: goto retry; bch2_trans_put(trans); - darray_exit(&buckets); - if (ret) { bch2_write_op_error(op, op->pos.offset, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); @@ -1484,24 +1484,11 @@ err: continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); } return; -err_get_ioref: - darray_for_each(buckets, i) - enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], - BCH_DEV_WRITE_REF_io_write); - - /* Fall back to COW path: */ - goto out; err_bucket_stale: - darray_for_each(buckets, i) { - bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); - if (i == stale_at) - break; - } - CLASS(printbuf, buf)(); if (bch2_fs_inconsistent_on(stale < 0, c, - "pointer to invalid bucket in nocow path on device %llu\n %s", - stale_at->b.inode, + "pointer to invalid bucket in nocow path on device %u\n %s", + stale_at->dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch_err_throw(c, data_write_invalid_ptr); } else { @@ -1509,7 +1496,13 @@ err_bucket_stale: ret = bch_err_throw(c, transaction_restart); } - goto err_get_ioref; + bch2_bkey_nocow_unlock(c, k, BUCKET_NOCOW_LOCK_UPDATE); + bkey_for_each_ptr(ptrs, ptr) + enumerated_ref_put(&bch2_dev_have_ref(c, ptr->dev)->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); + + /* Fall back to COW path: */ + goto out; } static void __bch2_write(struct bch_write_op *op) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ae747c87fcf9..f7b0fdd99c75 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -766,6 +766,9 @@ static int bch2_journal_reclaim_thread(void *arg) set_freezable(); + kthread_wait_freezable(test_bit(BCH_FS_rw, &c->flags) || + kthread_should_stop()); + j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { @@ -826,8 +829,10 @@ int bch2_journal_reclaim_start(struct journal *j) struct task_struct *p; int ret; - if (j->reclaim_thread) + if (j->reclaim_thread) { + wake_up_process(j->reclaim_thread); return 0; + } p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 139a6587a64e..9b172af4f8c8 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -84,7 +84,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct bch_inode_opts opts; ret = bch2_extent_get_apply_io_opts_one(trans, &opts, iter, k, ctx) ?: - bch2_bkey_set_needs_rebalance(c, &opts, n, ctx, 0) ?: + bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, ctx, 0) ?: drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false); if (ret) return ret; diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 71b17f18e90c..c8907070796f 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -6,6 +6,16 @@ #include "nocow_locking.h" #include "util.h" +#include <linux/prefetch.h> + +static bool nocow_bucket_empty(struct nocow_lock_bucket *l) +{ + for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++) + if 
(atomic_read(&l->l[i])) + return false; + return true; +} + bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) { u64 dev_bucket = bucket_to_u64(bucket); @@ -20,14 +30,12 @@ bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos #define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) +void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, u64 dev_bucket, int flags) { - u64 dev_bucket = bucket_to_u64(bucket); struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); int lock_val = flags ? 1 : -1; - unsigned i; - for (i = 0; i < ARRAY_SIZE(l->b); i++) + for (unsigned i = 0; i < ARRAY_SIZE(l->b); i++) if (l->b[i] == dev_bucket) { int v = atomic_sub_return(lock_val, &l->l[i]); @@ -40,8 +48,8 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc BUG(); } -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) +static int __bch2_bucket_nocow_trylock(struct bch_fs *c, struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) { int v, lock_val = flags ? 1 : -1; unsigned i; @@ -58,32 +66,128 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, goto take_lock; } - return false; + return bch_err_throw(c, nocow_trylock_bucket_full); got_entry: v = atomic_read(&l->l[i]); if (lock_val > 0 ? v < 0 : v > 0) - return false; + return bch_err_throw(c, nocow_trylock_contended); take_lock: v = atomic_read(&l->l[i]); /* Overflow? */ if (v && sign(v + lock_val) != sign(v)) - return false; + return bch_err_throw(c, nocow_trylock_contended); atomic_add(lock_val, &l->l[i]); + return 0; +} + +static inline bool bch2_bucket_nocow_trylock(struct bch_fs *c, struct bpos bucket, int flags) +{ + struct bucket_nocow_lock_table *t = &c->nocow_locks; + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + + return !__bch2_bucket_nocow_trylock(c, l, dev_bucket, flags); +} + +void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags); + } +} + +bool bch2_bkey_nocow_trylock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags) +{ + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + if (unlikely(!bch2_bucket_nocow_trylock(c, bucket, flags))) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + struct bch_dev *ca = bch2_dev_have_ref(c, ptr2->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr2); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, flags); + } + return false; + } + } + return true; } -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct nocow_lock_bucket *l, - u64 dev_bucket, int flags) +struct bucket_to_lock { + u64 b; + struct nocow_lock_bucket *l; +}; + +static inline int bucket_to_lock_cmp(struct bucket_to_lock l, + struct bucket_to_lock r) { - if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { - struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); + return cmp_int(l.l, r.l); +} + +void bch2_bkey_nocow_lock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags) +{ + if (bch2_bkey_nocow_trylock(c, ptrs, flags)) + return; 
+ + DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; + darray_init(&buckets); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + u64 b = bucket_to_u64(PTR_BUCKET_POS(ca, ptr)); + struct nocow_lock_bucket *l = + bucket_nocow_lock(&c->nocow_locks, b); + prefetch(l); + + /* XXX allocating memory with btree locks held - rare */ + darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .l = l, }), + GFP_KERNEL|__GFP_NOFAIL); + } + + WARN_ON_ONCE(buckets.nr > NOCOW_LOCK_BUCKET_SIZE); + + bubble_sort(buckets.data, buckets.nr, bucket_to_lock_cmp); +retake_all: + darray_for_each(buckets, i) { + int ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags); + if (!ret) + continue; + u64 start_time = local_clock(); - __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); + if (ret == -BCH_ERR_nocow_trylock_contended) + __closure_wait_event(&i->l->wait, + (ret = __bch2_bucket_nocow_trylock(c, i->l, i->b, flags)) != -BCH_ERR_nocow_trylock_contended); + if (!ret) { + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + continue; + } + + BUG_ON(ret != -BCH_ERR_nocow_trylock_bucket_full); + + darray_for_each(buckets, i2) { + if (i2 == i) + break; + __bch2_bucket_nocow_unlock(&c->nocow_locks, i2->b, flags); + } + + __closure_wait_event(&i->l->wait, nocow_bucket_empty(i->l)); bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + goto retake_all; } + + darray_exit(&buckets); } void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h index 48b8a003c0d2..972c914780f9 100644 --- a/fs/bcachefs/nocow_locking.h +++ b/fs/bcachefs/nocow_locking.h @@ -19,29 +19,19 @@ static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lo #define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); -void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); -bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, - struct nocow_lock_bucket *, u64, int); -static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) -{ - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); -} +void __bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, u64, int); -static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) +static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, + int flags) { - u64 dev_bucket = bucket_to_u64(bucket); - struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - - return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); + __bch2_bucket_nocow_unlock(t, bucket_to_u64(bucket), flags); } +void bch2_bkey_nocow_unlock(struct bch_fs *, struct bkey_s_c, int); +bool bch2_bkey_nocow_trylock(struct bch_fs *, struct bkey_ptrs_c, int); +void bch2_bkey_nocow_lock(struct bch_fs *, struct bkey_ptrs_c, int); + void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); void bch2_fs_nocow_locking_exit(struct bch_fs *); diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h index bd12bf677924..3fed8e957e20 
100644 --- a/fs/bcachefs/nocow_locking_types.h +++ b/fs/bcachefs/nocow_locking_types.h @@ -5,11 +5,13 @@ #define BUCKET_NOCOW_LOCKS_BITS 10 #define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) +#define NOCOW_LOCK_BUCKET_SIZE 6 + struct nocow_lock_bucket { struct closure_waitlist wait; spinlock_t lock; - u64 b[4]; - atomic_t l[4]; + u64 b[NOCOW_LOCK_BUCKET_SIZE]; + atomic_t l[NOCOW_LOCK_BUCKET_SIZE]; } __aligned(SMP_CACHE_BYTES); struct bucket_nocow_lock_table { diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index bd5faafc9aa7..e01c808e7893 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -103,6 +103,13 @@ static const char * const __bch2_fs_usage_types[] = { #undef x +static const char * const __bch2_rebalance_accounting_types[] = { +#define x(n) #n, + BCH_REBALANCE_ACCOUNTING() +#undef x + NULL +}; + static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[], unsigned nr, const char *type, unsigned idx) { @@ -125,6 +132,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); +PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) @@ -646,11 +654,8 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, val = bch2_opt_val_synonym_lookup(name, val); - if (!(bch2_opt_table[id].flags & OPT_MOUNT)) - return -BCH_ERR_option_name; - - if (id == Opt_acl && - !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) + if (!(bch2_opt_table[id].flags & OPT_MOUNT) && + !(bch2_opt_table[id].flags & OPT_MOUNT_OLD)) return -BCH_ERR_option_name; if ((id == Opt_usrquota || @@ -673,6 +678,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, if (ret < 0) return -BCH_ERR_option_value; + if (bch2_opt_table[id].flags & OPT_MOUNT_OLD) { + pr_err("option %s may no longer be specified at mount time; set via sysfs opts dir", + bch2_opt_table[id].attr.name); + return 0; + } + if (opts) bch2_opt_set_by_id(opts, id, v); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 6b9f18839345..68982196b5dc 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -34,6 +34,7 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); +void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type); static inline const char *bch2_d_type_str(unsigned d_type) { @@ -66,6 +67,7 @@ enum opt_flags { OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ OPT_HIDDEN = BIT(11), + OPT_MOUNT_OLD = BIT(12), /* May not be specified at mount time, but don't fail the mount */ }; enum opt_type { @@ -149,12 +151,12 @@ enum fsck_err_opts { BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ - OPT_FS|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ BCH_SB_META_REPLICAS_WANT, 1, \ "#", "Number of metadata replicas") \ x(data_replicas, u8, \ - 
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ BCH_SB_DATA_REPLICAS_WANT, 1, \ "#", "Number of data replicas") \ @@ -175,12 +177,12 @@ enum fsck_err_opts { BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ - OPT_FS|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_STR(__bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ @@ -190,12 +192,12 @@ enum fsck_err_opts { BCH_SB_CSUM_ERR_RETRY_NR, 3, \ NULL, NULL) \ x(compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ @@ -205,27 +207,27 @@ enum fsck_err_opts { BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FS|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or label for metadata writes") \ x(foreground_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_FOREGROUND_TARGET, 0, \ "(target)", "Device or label for foreground writes") \ x(background_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_BACKGROUND_TARGET, 0, \ "(target)", "Device or label to move data to in the background")\ x(promote_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0, \ "(target)", "Device or label to promote data to on read") \ x(erasure_code, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT_OLD|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 67d6a90e86ef..0e40e7bd3441 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -10,6 +11,7 @@ #include "clock.h" #include "compress.h" #include "disk_groups.h" +#include "ec.h" #include "errcode.h" #include "error.h" #include "inode.h" @@ -25,8 +27,29 @@ #include <linux/kthread.h> #include <linux/sched/cputime.h> +#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) + /* bch_extent_rebalance: */ +int bch2_extent_rebalance_validate(struct bch_fs *c, + struct bkey_s_c k, + struct bkey_validate_context from, + const struct bch_extent_rebalance *r) +{ + int ret = 0; + + bkey_fsck_err_on(r->pending && 
!(r->need_rb & BIT(BCH_REBALANCE_background_target)), + c, extent_rebalance_bad_pending, + "pending incorrectly set"); + + bkey_fsck_err_on(r->hipri && !(r->need_rb & BIT(BCH_REBALANCE_data_replicas)), + c, extent_rebalance_bad_pending, + "hipri incorrectly set"); + +fsck_err: + return ret; +} + static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { const union bch_extent_entry *entry; @@ -38,15 +61,30 @@ static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct b return NULL; } -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) { return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); } +static const char * const rebalance_opts[] = { +#define x(n) #n, + BCH_REBALANCE_OPTS() +#undef x + NULL +}; + void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_rebalance *r) { - prt_printf(out, "replicas=%u", r->data_replicas); + prt_str(out, "need_rb="); + prt_bitflags(out, rebalance_opts, r->need_rb); + + if (r->hipri) + prt_str(out, " hipri"); + if (r->pending) + prt_str(out, " pending"); + + prt_printf(out, " replicas=%u", r->data_replicas); if (r->data_replicas_from_inode) prt_str(out, " (inode)"); @@ -92,32 +130,54 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_trigger_extent_rebalance(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - enum btree_iter_update_trigger_flags flags) -{ - struct bch_fs *c = trans->c; - int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta[1] = { 0 }; +/* + * XXX: check in bkey_validate that if r->hipri or r->pending are set, + * r->data_replicas are also set + */ - s64 s = bch2_bkey_sectors_need_rebalance(c, old); - need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta[0] -= s; +static inline unsigned rb_accounting_counters(const struct bch_extent_rebalance *r) +{ + if (!r) + return 0; + unsigned ret = r->need_rb; - s = bch2_bkey_sectors_need_rebalance(c, new); - need_rebalance_delta += s != 0; - need_rebalance_sectors_delta[0] += s; + if (r->hipri) + ret |= BIT(BCH_REBALANCE_ACCOUNTING_high_priority); + if (r->pending) { + ret |= BIT(BCH_REBALANCE_ACCOUNTING_pending); + ret &= ~BIT(BCH_REBALANCE_ACCOUNTING_background_target); + } + return ret; +} - if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { +int __bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned old_r, unsigned new_r, + enum btree_iter_update_trigger_flags flags) +{ + int delta = (int) !!new_r - (int) !!old_r; + if ((flags & BTREE_TRIGGER_transactional) && delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, need_rebalance_delta > 0); + new.k->p, delta > 0); if (ret) return ret; } - if (need_rebalance_sectors_delta[0]) { + delta = old.k->size == new.k->size + ? 
old_r ^ new_r + : old_r | new_r; + while (delta) { + unsigned c = __ffs(delta); + delta ^= BIT(c); + + s64 v[1] = { 0 }; + if (old_r & BIT(c)) + v[0] -= (s64) old.k->size; + if (new_r & BIT(c)) + v[0] += (s64) new.k->size; + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, - need_rebalance_sectors_delta, rebalance_work); + v, rebalance_work_v2, c); if (ret) return ret; } @@ -125,39 +185,48 @@ int bch2_trigger_extent_rebalance(struct btree_trans *trans, return 0; } -static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, - struct bch_inode_opts *io_opts, - unsigned *move_ptrs, - unsigned *compress_ptrs, - u64 *sectors) +static struct bch_extent_rebalance +bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, + struct bch_inode_opts *opts, + unsigned *move_ptrs, + unsigned *compress_ptrs, + unsigned *csum_ptrs, + bool may_update_indirect) { *move_ptrs = 0; *compress_ptrs = 0; - *sectors = 0; + *csum_ptrs = 0; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs); - if (!io_opts && !rb_opts) - return; + struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance) }; if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return; + return r; - unsigned compression_type = - bch2_compression_opt_to_type(io_opts - ? io_opts->background_compression - : rb_opts->background_compression); - unsigned target = io_opts - ? io_opts->background_target - : rb_opts->background_target; - if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target)) - target = 0; + const struct bch_extent_rebalance *old_r = bch2_bkey_ptrs_rebalance_opts(ptrs); + if (old_r) { + r = *old_r; + r.need_rb = 0; + } + +#define x(_name) \ + if (k.k->type != KEY_TYPE_reflink_v || \ + may_update_indirect || \ + (!opts->_name##_from_inode && !r._name##_from_inode)) { \ + r._name = opts->_name; \ + r._name##_from_inode = opts->_name##_from_inode; \ + } + BCH_REBALANCE_OPTS() +#undef x + + unsigned compression_type = bch2_compression_opt_to_type(r.background_compression); + unsigned csum_type = bch2_data_checksum_type_rb(c, r); + + bool incompressible = false, unwritten = false, ec = false; + unsigned durability = 0, min_durability = INT_MAX; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - bool incompressible = false, unwritten = false; - unsigned ptr_idx = 1; guard(rcu)(); @@ -166,102 +235,285 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k, unwritten |= p.ptr.unwritten; if (!p.ptr.cached) { - if (p.crc.compression_type != compression_type) + if (p.crc.compression_type != compression_type) { *compress_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_compression); + } - if (target && !bch2_dev_in_target(c, p.ptr.dev, target)) + if (p.crc.csum_type != csum_type) { + *csum_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_data_checksum); + } + + if (r.background_target && + !bch2_dev_in_target(c, p.ptr.dev, r.background_target)) { *move_ptrs |= ptr_idx; + r.need_rb |= BIT(BCH_REBALANCE_background_target); + } + + unsigned d = bch2_extent_ptr_durability(c, &p); + durability += d; + min_durability = min(min_durability, d); + + ec |= p.has_ec; } ptr_idx <<= 1; } - if (unwritten) - *compress_ptrs = 0; - if (incompressible) + if (unwritten || incompressible) { *compress_ptrs = 0; + r.need_rb &= ~BIT(BCH_REBALANCE_background_compression); + } - unsigned rb_ptrs = *move_ptrs | *compress_ptrs; + if (unwritten) { + *csum_ptrs = 
0; + r.need_rb &= !BIT(BCH_REBALANCE_data_checksum); + } - if (!rb_ptrs) - return; + if (durability < r.data_replicas || durability >= r.data_replicas + min_durability) + r.need_rb |= BIT(BCH_REBALANCE_data_replicas); + if (!unwritten && r.erasure_code != ec) + r.need_rb |= BIT(BCH_REBALANCE_erasure_code); + return r; +} - ptr_idx = 1; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (rb_ptrs & ptr_idx) - *sectors += p.crc.compressed_size; - ptr_idx <<= 1; +static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, bool *v) +{ + if (v && *v) + return 1; + + /* + * If opts need to be propagated to the extent, a scan cookie should be + * present: + */ + CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + 0); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = k.k->type == KEY_TYPE_cookie; + if (v) + *v = ret; + return ret; +} + +static int check_dev_rebalance_scan_cookie(struct btree_trans *trans, struct bkey_s_c k, + struct bch_devs_mask *v) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + if (v && test_bit(ptr->dev, v->d)) + return 1; + + bkey_for_each_ptr(ptrs, ptr) { + int ret = check_rebalance_scan_cookie(trans, ptr->dev + 1, NULL); + if (ret < 0) + return ret; + if (ret) { + if (v) + __set_bit(ptr->dev, v->d); + return ret; + } } + + return 0; } -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +static bool bkey_has_ec(struct bkey_s_c k) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; - bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, §ors); - return sectors; + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) + return true; + return false; } -static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static int new_needs_rb_allowed(struct btree_trans *trans, + struct per_snapshot_io_opts *s, + struct bkey_s_c k, + enum set_needs_rebalance_ctx ctx, + unsigned opt_change_cookie, + const struct bch_extent_rebalance *old, + const struct bch_extent_rebalance *new, + unsigned new_need_rb) { - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; + struct bch_fs *c = trans->c; + /* + * New need_rb - pointers that don't match the current io path options - + * are only allowed in certain situations: + * + * Propagating new options: from bch2_set_rebalance_needs_scan + * + * Foreground writes: background_compression and background_target are + * allowed + * + * Foreground writes: we may have raced with an option change: + * opt_change_cookie checks for this + * + * XXX: foreground writes should still match compression, + * foreground_target - figure out how to check for this + */ + if (ctx == SET_NEEDS_REBALANCE_opt_change || + ctx == SET_NEEDS_REBALANCE_opt_change_indirect) + return 0; + + if ((new_need_rb & BIT(BCH_REBALANCE_erasure_code)) && + !bkey_has_ec(k)) { + /* Foreground writes are not initially erasure coded - and we + * may crash before a stripe is created + */ + new_need_rb &= ~BIT(BCH_REBALANCE_erasure_code); + } + + if (ctx == SET_NEEDS_REBALANCE_foreground) { + new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)| + BIT(BCH_REBALANCE_background_target)); + + /* + * Foreground writes might end up degraded when a 
device is + * getting yanked: + * + * XXX: this is something we need to fix, but adding retries to + * the write path is something we have to do carefully. + */ + new_need_rb &= ~BIT(BCH_REBALANCE_data_replicas); + if (!new_need_rb) + return 0; + + if (opt_change_cookie != atomic_read(&c->opt_change_cookie)) + return 0; + } - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - return move_ptrs|compress_ptrs; + /* + * Either the extent data or the extent io options (from + * bch_extent_rebalance) should match the io_opts from the + * inode/filesystem, unless + * + * - There's a scan pending to propagate new options + * - It's an indirect extent: it may be referenced by inodes + * with inconsistent options + * + * For efficiency (so that we can cache checking for scan + * cookies), only check option consistency when we're called + * with snapshot_io_opts - don't bother when we're called from + * move_data_phys() -> get_io_opts_one() + * + * Note that we can cache the existence of a cookie, but not the + * non-existence, to avoid spurious false positives. + */ + int ret = check_rebalance_scan_cookie(trans, 0, s ? &s->fs_scan_cookie : NULL) ?: + check_rebalance_scan_cookie(trans, k.k->p.inode, s ? &s->inum_scan_cookie : NULL); + if (ret < 0) + return ret; + if (ret) + return 0; + + if (new_need_rb == BIT(BCH_REBALANCE_data_replicas)) { + ret = check_dev_rebalance_scan_cookie(trans, k, s ? &s->dev_cookie : NULL); + if (ret < 0) + return ret; + if (ret) + return 0; + } + + CLASS(printbuf, buf)(); + + prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n"); + bch2_bkey_val_to_text(&buf, c, k); + + const struct bch_extent_rebalance _old = {}; + if (!old) + old = &_old; + +#define x(_name) \ + if (new_need_rb & BIT(BCH_REBALANCE_##_name)) \ + prt_printf(&buf, "\n" #_name " %u != %u", old->_name, new->_name); + BCH_REBALANCE_OPTS() +#undef x + + fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf); +fsck_err: + return ret; } -static inline bool bkey_should_have_rb_opts(struct bch_fs *c, - struct bch_inode_opts *opts, - struct bkey_s_c k) +static inline bool bkey_should_have_rb_opts(struct bkey_s_c k, + struct bch_extent_rebalance new) { if (k.k->type == KEY_TYPE_reflink_v) { -#define x(n) if (opts->n##_from_inode) return true; +#define x(n) if (new.n##_from_inode) return true; BCH_REBALANCE_OPTS() #undef x } - return bch2_bkey_ptrs_need_rebalance(c, opts, k); + return new.need_rb; } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, +int bch2_bkey_set_needs_rebalance(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bch_inode_opts *opts, struct bkey_i *_k, enum set_needs_rebalance_ctx ctx, - u32 change_cookie) + u32 opt_change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; + struct bch_fs *c = trans->c; struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *old = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (bkey_should_have_rb_opts(c, opts, k.s_c)) { + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); + + bool should_have_rb = bkey_should_have_rb_opts(k.s_c, new); + + if (should_have_rb == !!old && + (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old)) + return 0; + + unsigned new_need_rb = new.need_rb & ~(old ? 
old->need_rb : 0); + + if (unlikely(new_need_rb)) { + int ret = new_needs_rb_allowed(trans, snapshot_io_opts, + k.s_c, ctx, opt_change_cookie, + old, &new, new_need_rb); + if (ret) + return ret; + } + + if (should_have_rb) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(c, opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } + *old = new; + } else if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); return 0; } static int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *io_opts, struct btree_iter *iter, struct bkey_s_c k, enum set_needs_rebalance_ctx ctx) { struct bch_fs *c = trans->c; + int ret = 0; BUG_ON(iter->flags & BTREE_ITER_is_extents); BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); @@ -269,36 +521,24 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, if (!bkey_extent_is_direct_data(k.k)) return 0; - bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect; + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k); - /* - * If it's an indirect extent, and we walked to it directly, we won't - * have the options from the inode that were directly applied: options - * from the extent take precedence - unless the io_opts option came from - * the inode and may_update_indirect is true (walked from a - * REFLINK_P_MAY_UPDATE_OPTIONS pointer). - */ - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - if (old && k.k->type == KEY_TYPE_reflink_v) { -#define x(_name) \ - if (old->_name##_from_inode && \ - !(may_update_indirect && io_opts->_name##_from_inode)) { \ - io_opts->_name = old->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + struct bch_extent_rebalance new = + bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, &compress_ptrs, &csum_ptrs, + ctx == SET_NEEDS_REBALANCE_opt_change_indirect); - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts); + bool should_have_rb = bkey_should_have_rb_opts(k, new); - if (bkey_should_have_rb_opts(c, io_opts, k) - ? old && !memcmp(old, &new, sizeof(new)) - : !old) + if (should_have_rb == !!old && + (should_have_rb ? 
!memcmp(old, &new, sizeof(new)) : !old)) return 0; struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); + ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; @@ -306,7 +546,7 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans, /* On successfull transaction commit, @k was invalidated: */ - return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?: + return bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, io_opts, n, ctx, 0) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: bch_err_throw(c, transaction_restart_commit); @@ -349,7 +589,8 @@ static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans, darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_pos.inode; + io_opts->cur_inum = extent_pos.inode; + io_opts->inum_scan_cookie = false; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -372,11 +613,13 @@ struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans, enum set_needs_rebalance_ctx ctx) { struct bch_inode_opts *opts = - bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k); + bch2_extent_get_io_opts(trans, snapshot_io_opts, + extent_pos, extent_iter, extent_k); if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level) return opts; - int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx); + int ret = bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, + extent_iter, extent_k, ctx); return ret ? ERR_PTR(ret) : opts; } @@ -420,11 +663,9 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans, if (ret || btree_iter_path(trans, extent_iter)->level) return ret; - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx); + return bch2_get_update_rebalance_opts(trans, NULL, io_opts, extent_iter, extent_k, ctx); } -#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - static const char * const bch2_rebalance_state_strs[] = { #define x(t) #t, BCH_REBALANCE_STATES() @@ -467,6 +708,11 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) return ret; } +int bch2_set_rebalance_needs_scan_device(struct bch_fs *c, unsigned dev) +{ + return bch2_set_rebalance_needs_scan(c, dev + 1); +} + int bch2_set_fs_needs_rebalance(struct bch_fs *c) { return bch2_set_rebalance_needs_scan(c, 0); @@ -535,21 +781,21 @@ static struct bkey_i *next_rebalance_entry(struct btree_trans *trans, return &(&darray_pop(buf))->k_i; } -static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +static int extent_ec_pending(struct btree_trans *trans, struct bkey_ptrs_c ptrs) { - if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) - return 0; + struct bch_fs *c = trans->c; - struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; + guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (!ca) + continue; - extent_entry_drop(bkey_i_to_s(n), - (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + if (bch2_bucket_has_new_stripe(c, bucket_to_u64(bucket))) + return true; + } + return false; } static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, @@ -570,6 +816,10 @@ static struct bkey_s_c 
next_rebalance_extent(struct btree_trans *trans, if (bkey_err(k)) return k; + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + if (!r || !r->need_rb) /* Write buffer race? */ + return bkey_s_c_null; + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, snapshot_io_opts, extent_iter->pos, extent_iter, k, @@ -580,22 +830,86 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, *opts_ret = opts; + unsigned move_ptrs = 0; + unsigned compress_ptrs = 0; + unsigned csum_ptrs = 0; + bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &csum_ptrs, false); + memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); + data_opts->rewrite_ptrs = move_ptrs|compress_ptrs|csum_ptrs; data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; - if (!data_opts->rewrite_ptrs) { - /* - * device we would want to write to offline? devices in target - * changed? - * - * We'll now need a full scan before this extent is picked up - * again: - */ - int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); - if (ret) - return bkey_s_c_err(ret); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + if (r->need_rb & BIT(BCH_REBALANCE_data_replicas)) { + unsigned durability = bch2_bkey_durability(c, k); + unsigned ptr_bit = 1; + + guard(rcu)(); + if (durability <= opts->data_replicas) { + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (ca && !ptr->cached && !ca->mi.durability) + data_opts->kill_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + data_opts->extra_replicas = opts->data_replicas - durability; + } else { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= opts->data_replicas) { + data_opts->kill_ptrs |= ptr_bit; + durability -= d; + } + + ptr_bit <<= 1; + } + + ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.has_ec && durability - p.ec.redundancy >= opts->data_replicas) { + data_opts->kill_ec_ptrs |= ptr_bit; + durability -= p.ec.redundancy; + } + + ptr_bit <<= 1; + } + } + } + + if (r->need_rb & BIT(BCH_REBALANCE_erasure_code)) { + if (opts->erasure_code) { + /* XXX: we'll need ratelimiting */ + if (extent_ec_pending(trans, ptrs)) + return bkey_s_c_null; + + data_opts->extra_replicas = opts->data_replicas; + } else { + unsigned ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.has_ec) { + data_opts->kill_ec_ptrs |= ptr_bit; + data_opts->extra_replicas += p.ec.redundancy; + } + + ptr_bit <<= 1; + } + } + } + + if (!data_opts->rewrite_ptrs && + !data_opts->kill_ptrs && + !data_opts->kill_ec_ptrs && + !data_opts->extra_replicas) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "got extent to rebalance but nothing to do, confused\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s", buf.buf); return bkey_s_c_null; } @@ -605,12 +919,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); - unsigned move_ptrs = 0; - unsigned compress_ptrs = 0; - u64 sectors = 0; - - bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, §ors); - if (move_ptrs) { prt_str(&buf, "move="); bch2_target_to_text(&buf, c, opts->background_target); @@ -627,6 +935,14 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, 
prt_newline(&buf); } + if (csum_ptrs) { + prt_str(&buf, "csum="); + bch2_prt_csum_opt(&buf, opts->data_checksum); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, csum_ptrs); + prt_newline(&buf); + } + trace_rebalance_extent(c, buf.buf); } count_event(c, rebalance_extent); @@ -688,8 +1004,75 @@ out: return ret; } +static int do_rebalance_scan_bp(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) +{ + if (bp.v->level) /* metadata not supported yet */ + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!k.k) + return 0; + + struct bch_inode_opts io_opts; + ret = bch2_extent_get_io_opts_one(trans, &io_opts, &iter, k, + SET_NEEDS_REBALANCE_opt_change); + bch2_trans_iter_exit(&iter); + return ret; +} + +static int do_rebalance_scan_device(struct moving_context *ctxt, + unsigned dev, u64 cookie, + u64 *sectors_scanned) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &c->rebalance; + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + bch2_btree_write_buffer_flush_sync(trans); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, + POS(dev, 0), POS(dev, U64_MAX), + BTREE_ITER_prefetch, k, ({ + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + + if (k.k->type != KEY_TYPE_backpointer) + continue; + + do_rebalance_scan_bp(trans, bkey_s_c_to_backpointer(k), &last_flushed); + })) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, dev + 1, cookie)); + + *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen); + /* + * Ensure that the rebalance_work entries we created are seen by the + * next iteration of do_rebalance(), so we don't end up stuck in + * rebalance_wait(): + */ + *sectors_scanned += 1; + bch2_move_stats_exit(&r->scan_stats, c); + + bch2_btree_write_buffer_flush_sync(trans); + + bch2_bkey_buf_exit(&last_flushed, c); + return ret; +} + static int do_rebalance_scan_indirect(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + struct per_snapshot_io_opts *snapshot_io_opts, struct bch_inode_opts *opts) { u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); @@ -702,7 +1085,7 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans, BTREE_ITER_not_extents, k, ({ if (bpos_ge(bkey_start_pos(k.k), POS(0, end))) break; - bch2_get_update_rebalance_opts(trans, opts, &iter, k, + bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, k, SET_NEEDS_REBALANCE_opt_change_indirect); })); if (ret) @@ -724,15 +1107,21 @@ static int do_rebalance_scan(struct moving_context *ctxt, bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ctxt->stats = &r->scan_stats; + r->state = BCH_REBALANCE_scanning; + if (!inum) { r->scan_start = BBPOS_MIN; r->scan_end = BBPOS_MAX; - } else { + } else if (inum >= BCACHEFS_ROOT_INO) { r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); - } + } else { + unsigned dev = inum - 1; + r->scan_start = BBPOS(BTREE_ID_backpointers, POS(dev, 0)); + r->scan_end = BBPOS(BTREE_ID_backpointers, POS(dev, U64_MAX)); - r->state = BCH_REBALANCE_scanning; + return do_rebalance_scan_device(ctxt, inum - 1, cookie, sectors_scanned); + } int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, r->scan_start.pos, 
r->scan_end.pos, @@ -750,7 +1139,8 @@ static int do_rebalance_scan(struct moving_context *ctxt, (inum && k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) - ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts) + ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), + snapshot_io_opts, opts) : 0); })); if (ret) @@ -1049,6 +1439,7 @@ int bch2_fs_rebalance_init(struct bch_fs *c) static int check_rebalance_work_one(struct btree_trans *trans, struct btree_iter *extent_iter, struct btree_iter *rebalance_iter, + struct per_snapshot_io_opts *snapshot_io_opts, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -1089,8 +1480,7 @@ static int check_rebalance_work_one(struct btree_trans *trans, extent_k.k = &deleted; } - bool should_have_rebalance = - bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool should_have_rebalance = bch2_bkey_needs_rb(extent_k); bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; if (should_have_rebalance != have_rebalance) { @@ -1119,6 +1509,21 @@ static int check_rebalance_work_one(struct btree_trans *trans, return ret; } + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, + snapshot_io_opts, extent_iter->pos, extent_iter, extent_k, + SET_NEEDS_REBALANCE_other); + ret = PTR_ERR_OR_ZERO(opts); + if (ret == -BCH_ERR_transaction_restart_commit) { + /* + * If get_apply_io_opts() did work, just advance and check the + * next key; it may have updated the rebalance_work btree so + * we'd need a write buffer flush to check what it just did. + */ + ret = 0; + } + if (ret) + return ret; + if (cmp <= 0) bch2_btree_iter_advance(extent_iter); if (cmp >= 0) @@ -1131,10 +1536,14 @@ int bch2_check_rebalance_work(struct bch_fs *c) { CLASS(btree_trans, trans)(c); CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_not_extents| BTREE_ITER_prefetch); CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, BTREE_ITER_prefetch); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -1148,12 +1557,15 @@ int bch2_check_rebalance_work(struct bch_fs *c) bch2_trans_begin(trans); - ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, + &snapshot_io_opts, &last_flushed) ?: + bch2_trans_commit(trans, NULL, NULL, 0); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_bkey_buf_exit(&last_flushed, c); return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 24bafa42f070..e6dd9a7db26c 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -7,10 +7,14 @@ #include "opts.h" #include "rebalance_types.h" +int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context, + const struct bch_extent_rebalance *); + static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, struct bch_inode_opts *opts) { - struct bch_extent_rebalance r = { + return (struct bch_extent_rebalance) { .type = BIT(BCH_EXTENT_ENTRY_rebalance), #define x(_name) \ ._name = opts->_name, \ @@ -18,22 +22,36 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f BCH_REBALANCE_OPTS() #undef x }; - - if (r.background_target && - !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) - r.background_target = 0; - - return r; }; void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, const struct bch_extent_rebalance *); -int bch2_trigger_extent_rebalance(struct btree_trans *, - struct bkey_s_c, struct bkey_s_c, - enum btree_iter_update_trigger_flags); +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); + +static inline int bch2_bkey_needs_rb(struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + return r ? r->need_rb : 0; +} + +int __bch2_trigger_extent_rebalance(struct btree_trans *, + struct bkey_s_c, struct bkey_s_c, + unsigned, unsigned, + enum btree_iter_update_trigger_flags); -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + enum btree_iter_update_trigger_flags flags) +{ + unsigned old_r = bch2_bkey_needs_rb(old); + unsigned new_r = bch2_bkey_needs_rb(new); + + return old_r != new_r || + (old.k->size != new.k->size && (old_r|new_r)) + ? 
__bch2_trigger_extent_rebalance(trans, old, new, old_r, new_r, flags) + : 0; +} enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_opt_change, @@ -42,9 +60,6 @@ enum set_needs_rebalance_ctx { SET_NEEDS_REBALANCE_other, }; -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *, - struct bkey_i *, enum set_needs_rebalance_ctx, u32); - /* Inodes in different snapshots may have different IO options: */ struct snapshot_io_opts_entry { u32 snapshot; @@ -53,6 +68,10 @@ struct snapshot_io_opts_entry { struct per_snapshot_io_opts { u64 cur_inum; + bool fs_scan_cookie; + bool inum_scan_cookie; + struct bch_devs_mask dev_cookie; + struct bch_inode_opts fs_io_opts; DARRAY(struct snapshot_io_opts_entry) d; }; @@ -68,6 +87,10 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt darray_exit(&io_opts->d); } +int bch2_bkey_set_needs_rebalance(struct btree_trans *, + struct per_snapshot_io_opts *, struct bch_inode_opts *, + struct bkey_i *, enum set_needs_rebalance_ctx, u32); + struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *, struct per_snapshot_io_opts *, struct bpos, struct btree_iter *, struct bkey_s_c, @@ -82,6 +105,7 @@ int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opt int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); +int bch2_set_rebalance_needs_scan_device(struct bch_fs *, unsigned); int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h index ff9a1342a22b..d7a5f899e789 100644 --- a/fs/bcachefs/rebalance_format.h +++ b/fs/bcachefs/rebalance_format.h @@ -5,49 +5,76 @@ struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, - unused:3, + unused:5, + hipri:1, + pending:1, + need_rb:5, - promote_target_from_inode:1, - erasure_code_from_inode:1, + data_replicas_from_inode:1, data_checksum_from_inode:1, + erasure_code_from_inode:1, background_compression_from_inode:1, - data_replicas_from_inode:1, background_target_from_inode:1, + promote_target_from_inode:1, - promote_target:16, - erasure_code:1, + data_replicas:3, data_checksum:4, - data_replicas:4, + erasure_code:1, background_compression:8, /* enum bch_compression_opt */ - background_target:16; + background_target:12, + promote_target:12; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, + __u64 promote_target:12, + background_target:12, background_compression:8, - data_replicas:4, - data_checksum:4, erasure_code:1, - promote_target:16, + data_checksum:4, + data_replicas:3, + promote_target_from_inode:1, background_target_from_inode:1, - data_replicas_from_inode:1, background_compression_from_inode:1, - data_checksum_from_inode:1, erasure_code_from_inode:1, - promote_target_from_inode:1, + data_checksum_from_inode:1, + data_replicas_from_inode:1, - unused:3, + need_rb:5, + pending:1, + hipri:1, + unused:5, type:6; #endif }; /* subset of BCH_INODE_OPTS */ #define BCH_REBALANCE_OPTS() \ + x(data_replicas) \ x(data_checksum) \ + x(erasure_code) \ x(background_compression) \ + x(background_target) \ + x(promote_target) + +enum bch_rebalance_opts { +#define x(n) BCH_REBALANCE_##n, + BCH_REBALANCE_OPTS() +#undef x +}; + +#define BCH_REBALANCE_ACCOUNTING() \ x(data_replicas) \ - x(promote_target) \ + x(data_checksum) \ + x(erasure_code) \ + x(background_compression) \ x(background_target) \ - 
x(erasure_code) + x(high_priority) \ + x(pending) \ + +enum bch_rebalance_accounting_type { +#define x(n) BCH_REBALANCE_ACCOUNTING_##n, + BCH_REBALANCE_ACCOUNTING() +#undef x +}; #endif /* _BCACHEFS_REBALANCE_FORMAT_H */ diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h index b7eff904acdb..898caf943b34 100644 --- a/fs/bcachefs/replicas_format.h +++ b/fs/bcachefs/replicas_format.h @@ -17,7 +17,8 @@ struct bch_replicas_entry_v1 { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[] __counted_by(nr_devs); + /* No counted_by: bch_replicas_cpu entries are all the size of the biggest entry */ + __u8 devs[]; } __packed; struct bch_sb_field_replicas { diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index bfd06fd5d506..66b7f19f0437 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -107,7 +107,10 @@ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\ x(btree_node_accounting, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \ + BCH_FSCK_ERR_extent_io_opts_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 77e3fc92e39b..010b2a86f1be 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -74,6 +74,7 @@ enum bch_fsck_flags { x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ x(btree_node_read_error, 62, FSCK_AUTOFIX) \ + x(btree_node_topology_gap_between_nodes, 328, FSCK_AUTOFIX) \ x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ x(btree_node_topology_bad_root_min_key, 323, FSCK_AUTOFIX) \ @@ -159,6 +160,8 @@ enum bch_fsck_flags { x(extent_ptrs_redundant_stripe, 139, 0) \ x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(extent_rebalance_bad_pending, 331, 0) \ + x(extent_rebalance_bad_hipri, 332, 0) \ x(ptr_to_invalid_device, 142, 0) \ x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_duplicate_device, 143, 0) \ @@ -339,7 +342,9 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 328, 0) + x(extent_io_opts_not_set, 329, FSCK_AUTOFIX) \ + x(extent_io_opts_unneeded, 330, FSCK_AUTOFIX) \ + x(MAX, 333, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 5cd308a68035..534ba2c0f83a 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -564,15 +564,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) * successfully marked the filesystem dirty */ - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) - goto err; - set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); enumerated_ref_start(&c->writes); + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) { + bch_err_msg(c, ret, "error starting journal reclaim thread"); + goto err; + } + ret = bch2_copygc_start(c); if (ret) { bch_err_msg(c, ret, "error starting copygc thread"); @@ -852,7 +854,8 @@ int bch2_fs_init_rw(struct bch_fs *c) bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_io_write_init(c) ?: - bch2_fs_journal_init(&c->journal); + bch2_fs_journal_init(&c->journal) ?: + bch2_journal_reclaim_start(&c->journal); if (ret) return ret; @@ -1971,6 +1974,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch_notice(ca, "%s", bch2_member_states[new_state]); + if (new_state == BCH_MEMBER_STATE_failed) + bch2_set_rebalance_needs_scan_device(c, ca->dev_idx); + scoped_guard(mutex, &c->sb_lock) { struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_STATE(m, new_state); @@ -1980,6 +1986,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, if (new_state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); + if (new_state == BCH_MEMBER_STATE_failed) + bch2_set_rebalance_needs_scan_device(c, ca->dev_idx); + bch2_rebalance_wakeup(c); return ret; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ef6312c50f88..85ccc084ecd9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -793,6 +793,9 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, bool is_sb = opt->get_sb || opt->get_member; bool changed = false; + if (id == Opt_durability) + bch2_set_rebalance_needs_scan_device(c, ca->dev_idx); + if (is_sb) { changed = bch2_opt_set_sb(c, ca, opt, v); } else if (!ca) { @@ -806,6 +809,9 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, if (!ca) bch2_opt_set_by_id(&c->opts, id, v); + if (id == Opt_durability) + bch2_set_rebalance_needs_scan_device(c, ca->dev_idx); + if (changed) bch2_opt_hook_post_set(c, ca, 0, id, v); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 6c312fd9a447..c5d7be2eba03 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1339,11 +1339,6 @@ DEFINE_EVENT(fs_str, io_move_pred, TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, io_move_created_rebalance, - TP_PROTO(struct bch_fs *c, const char *str), - TP_ARGS(c, str) -); - DEFINE_EVENT(fs_str, io_move_evacuate_bucket, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 555e0d8f3cf0..eb4222f0f34b 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -7,7 +7,6 @@ #include <linux/errno.h> #include <linux/freezer.h> #include <linux/kernel.h> -#include <linux/min_heap.h> #include <linux/sched/clock.h> #include <linux/llist.h> #include <linux/log2.h> diff --git a/fs/bcachefs/vendor/min_heap.c b/fs/bcachefs/vendor/min_heap.c new file mode 100644 index 000000000000..21f744549d66 --- /dev/null +++ b/fs/bcachefs/vendor/min_heap.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "min_heap.h" + +void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size) +{ + __min_heap_init_inline(heap, data, 
size); +} + +void *__bch2_min_heap_peek(struct min_heap_char *heap) +{ + return __min_heap_peek_inline(heap); +} + +bool __bch2_min_heap_full(min_heap_char *heap) +{ + return __min_heap_full_inline(heap); +} + +void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_sift_down_inline(heap, pos, elem_size, func, args); +} + +void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); +} + +void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heapify_all_inline(heap, elem_size, func, args); +} + +bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_pop_inline(heap, elem_size, func, args); +} + +void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_pop_push_inline(heap, element, elem_size, func, args); +} + +bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_push_inline(heap, element, elem_size, func, args); +} + +bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_del_inline(heap, elem_size, idx, func, args); +} diff --git a/fs/bcachefs/vendor/min_heap.h b/fs/bcachefs/vendor/min_heap.h new file mode 100644 index 000000000000..865c8b33b11f --- /dev/null +++ b/fs/bcachefs/vendor/min_heap.h @@ -0,0 +1,477 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MIN_HEAP_H +#define _LINUX_MIN_HEAP_H + +#include <linux/bug.h> +#include <linux/string.h> +#include <linux/types.h> + +/* + * The Min Heap API provides utilities for managing min-heaps, a binary tree + * structure where each node's value is less than or equal to its children's + * values, ensuring the smallest element is at the root. + * + * Users should avoid directly calling functions prefixed with __min_heap_*(). + * Instead, use the provided macro wrappers. + * + * For further details and examples, refer to Documentation/core-api/min_heap.rst. + */ + +/** + * Data structure to hold a min-heap. + * @nr: Number of elements currently in the heap. + * @size: Maximum number of elements that can be held in current storage. + * @data: Pointer to the start of array holding the heap elements. + * @preallocated: Start of the static preallocated array holding the heap elements. + */ +#define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ +struct _name { \ + size_t nr; \ + size_t size; \ + _type *data; \ + _type preallocated[_nr]; \ +} + +#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0) + +typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char; + +#define __minheap_cast(_heap) (typeof((_heap)->data[0]) *) +#define __minheap_obj_size(_heap) sizeof((_heap)->data[0]) + +/** + * struct min_heap_callbacks - Data/functions to customise the min_heap. + * @less: Partial order function for this heap. + * @swp: Swap elements function. 
+ */ +struct min_heap_callbacks { + bool (*less)(const void *lhs, const void *rhs, void *args); + void (*swp)(void *lhs, void *rhs, void *args); +}; + +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static __always_inline +void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static __always_inline +void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. 
+ */ +static __always_inline +void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0) +#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1) +#define SWAP_BYTES ((void (*)(void *, void *, void *))2) + +/* + * Selects the appropriate swap function based on the element size. + */ +static __always_inline +void *select_swap_func(const void *base, size_t size) +{ + if (is_aligned(base, size, 8)) + return SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + return SWAP_WORDS_32; + else + return SWAP_BYTES; +} + +static __always_inline +void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args), + void *priv) +{ + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, priv); +} + +/** + * parent - given the offset of the child, find the offset of the parent. + * @i: the offset of the heap element whose parent is sought. Non-zero. + * @lsbit: a precomputed 1-bit mask, equal to "size & -size" + * @size: size of each element + * + * In terms of array indexes, the parent of element j = @i/@size is simply + * (j-1)/2. But when working in byte offsets, we can't use implicit + * truncation of integer divides. + * + * Fortunately, we only need one bit of the quotient, not the full divide. + * @size has a least significant bit. That bit will be clear if @i is + * an even multiple of @size, and set if it's an odd multiple. + * + * Logically, we're doing "if (i & lsbit) i -= size;", but since the + * branch is unpredictable, it's done with a bit of clever branch-free + * code instead. + */ +__attribute_const__ __always_inline +static size_t parent(size_t i, unsigned int lsbit, size_t size) +{ + i -= size; + i -= size & -(i & lsbit); + return i / 2; +} + +/* Initialize a min-heap. */ +static __always_inline +void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size) +{ + heap->nr = 0; + heap->size = size; + if (data) + heap->data = data; + else + heap->data = heap->preallocated; +} + +#define min_heap_init_inline(_heap, _data, _size) \ + __min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) + +/* Get the minimum element from the heap. */ +static __always_inline +void *__min_heap_peek_inline(struct min_heap_char *heap) +{ + return heap->nr ? heap->data : NULL; +} + +#define min_heap_peek_inline(_heap) \ + (__minheap_cast(_heap) \ + __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr))) + +/* Check if the heap is full. */ +static __always_inline +bool __min_heap_full_inline(min_heap_char *heap) +{ + return heap->nr == heap->size; +} + +#define min_heap_full_inline(_heap) \ + __min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr)) + +/* Sift the element at pos down the heap. 
*/ +static __always_inline +void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + const unsigned long lsbit = elem_size & -elem_size; + void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; + /* pre-scale counters for performance */ + size_t a = pos * elem_size; + size_t b, c, d; + size_t n = heap->nr * elem_size; + + if (!swp) + swp = select_swap_func(data, elem_size); + + /* Find the sift-down path all the way to the leaves. */ + for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;) + b = func->less(data + c, data + d, args) ? c : d; + + /* Special case for the last leaf with no sibling. */ + if (d == n) + b = c; + + /* Backtrack to the correct location. */ + while (b != a && func->less(data + a, data + b, args)) + b = parent(b, lsbit, elem_size); + + /* Shift the element into its correct place. */ + c = b; + while (b != a) { + b = parent(b, lsbit, elem_size); + do_swap(data + b, data + c, elem_size, swp, args); + } +} + +#define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ + __min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) + +/* Sift up ith element from the heap, O(log2(nr)). */ +static __always_inline +void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + const unsigned long lsbit = elem_size & -elem_size; + void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; + /* pre-scale counters for performance */ + size_t a = idx * elem_size, b; + + if (!swp) + swp = select_swap_func(data, elem_size); + + while (a) { + b = parent(a, lsbit, elem_size); + if (func->less(data + b, data + a, args)) + break; + do_swap(data + a, data + b, elem_size, swp, args); + a = b; + } +} + +#define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ + __min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) + +/* Floyd's approach to heapification that is O(nr). */ +static __always_inline +void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + ssize_t i; + + for (i = heap->nr / 2 - 1; i >= 0; i--) + __min_heap_sift_down_inline(heap, i, elem_size, func, args); +} + +#define min_heapify_all_inline(_heap, _func, _args) \ + __min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) + +/* Remove minimum element from the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + + if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) + return false; + + /* Place last element at the root (position 0) and then sift down. */ + heap->nr--; + memcpy(data, data + (heap->nr * elem_size), elem_size); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); + + return true; +} + +#define min_heap_pop_inline(_heap, _func, _args) \ + __min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) + +/* + * Remove the minimum element and then push the given element. The + * implementation performs 1 sift (O(log2(nr))) and is therefore more + * efficient than a pop followed by a push that does 2. 
+ */ +static __always_inline +void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + memcpy(heap->data, element, elem_size); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); +} + +#define min_heap_pop_push_inline(_heap, _element, _func, _args) \ + __min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) + +/* Push an element on to the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + size_t pos; + + if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) + return false; + + /* Place at the end of data. */ + pos = heap->nr; + memcpy(data + (pos * elem_size), element, elem_size); + heap->nr++; + + /* Sift child at pos up. */ + __min_heap_sift_up_inline(heap, elem_size, pos, func, args); + + return true; +} + +#define min_heap_push_inline(_heap, _element, _func, _args) \ + __min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) + +/* Remove ith element from the heap, O(log2(nr)). */ +static __always_inline +bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; + + if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) + return false; + + if (!swp) + swp = select_swap_func(data, elem_size); + + /* Place last element at the root (position 0) and then sift down. */ + heap->nr--; + if (idx == heap->nr) + return true; + do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args); + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); + __min_heap_sift_down_inline(heap, idx, elem_size, func, args); + + return true; +} + +#define min_heap_del_inline(_heap, _idx, _func, _args) \ + __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) + +void __bch2_min_heap_init(min_heap_char *heap, void *data, size_t size); +void *__bch2_min_heap_peek(struct min_heap_char *heap); +bool __bch2_min_heap_full(min_heap_char *heap); +void __bch2_min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __bch2_min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); +void __bch2_min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __bch2_min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __bch2_min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __bch2_min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __bch2_min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); + +#define min_heap_init(_heap, _data, _size) \ + __bch2_min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) 
__bch2_min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr))) +#define min_heap_full(_heap) \ + __bch2_min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr)) +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __bch2_min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __bch2_min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heapify_all(_heap, _func, _args) \ + __bch2_min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop(_heap, _func, _args) \ + __bch2_min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __bch2_min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) +#define min_heap_push(_heap, _element, _func, _args) \ + __bch2_min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) +#define min_heap_del(_heap, _idx, _func, _args) \ + __bch2_min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) + +#endif /* _LINUX_MIN_HEAP_H */ |
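The vendored min_heap.h above keeps the inline implementation and exposes it through out-of-line __bch2_*() wrappers behind the usual min_heap_*() macro names. Below is a minimal usage sketch of that wrapper API, for illustration only: the element type, the bucket_heap / bucket_less / example_pop_smallest names and the include path are assumptions for the sketch, not part of this patch.

#include "vendor/min_heap.h"

/* Fixed-capacity heap of eight u64s, backed by the preallocated[] array. */
MIN_HEAP_PREALLOCATED(u64, bucket_heap, 8);

/* Comparator for struct min_heap_callbacks: smallest value at the root. */
static bool bucket_less(const void *l, const void *r, void *args)
{
	return *(const u64 *) l < *(const u64 *) r;
}

static const struct min_heap_callbacks bucket_heap_cbs = {
	.less	= bucket_less,
	.swp	= NULL,	/* NULL: do_swap() falls back to select_swap_func() */
};

static u64 example_pop_smallest(void)
{
	struct bucket_heap heap;
	u64 v, smallest;

	/* A NULL data pointer makes init use the preallocated storage. */
	min_heap_init(&heap, NULL, 8);

	/* Push 8..1; each push sifts the new element up, O(log n). */
	for (v = 8; v > 0; v--)
		if (!min_heap_push(&heap, &v, &bucket_heap_cbs, NULL))
			break;

	smallest = *min_heap_peek(&heap);	/* root holds the minimum (1) */
	min_heap_pop(&heap, &bucket_heap_cbs, NULL);
	return smallest;
}

Leaving .swp NULL is the common case for a sketch like this: the sift helpers call select_swap_func(), which for 8-byte, suitably aligned elements resolves to the 64-bit word swap shown in the header.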