-rw-r--r--   .bcachefs_revision                   |   2
-rw-r--r--   bch_bindgen/src/bkey.rs              |   2
-rw-r--r--   c_src/cmd_fs.c                       | 317
-rw-r--r--   libbcachefs/bcachefs.h               |  17
-rw-r--r--   libbcachefs/bcachefs_format.h        |  11
-rw-r--r--   libbcachefs/bkey_methods.c           |   6
-rw-r--r--   libbcachefs/bkey_types.h             |   5
-rw-r--r--   libbcachefs/btree_cache.c            |   4
-rw-r--r--   libbcachefs/btree_io.c               |   2
-rw-r--r--   libbcachefs/btree_iter.c             |  72
-rw-r--r--   libbcachefs/btree_iter.h             |   6
-rw-r--r--   libbcachefs/btree_trans_commit.c     |   2
-rw-r--r--   libbcachefs/btree_update.c           |  67
-rw-r--r--   libbcachefs/btree_update_interior.c  |  14
-rw-r--r--   libbcachefs/data_update.c            |   9
-rw-r--r--   libbcachefs/data_update.h            |   1
-rw-r--r--   libbcachefs/extents.c                |  17
-rw-r--r--   libbcachefs/extents.h                |   1
-rw-r--r--   libbcachefs/fs.c                     |  10
-rw-r--r--   libbcachefs/fsck.c                   |   4
-rw-r--r--   libbcachefs/lru.h                    |  10
-rw-r--r--   libbcachefs/move.c                   |  16
-rw-r--r--   libbcachefs/movinggc.c               | 188
-rw-r--r--   libbcachefs/rebalance.c              |  93
-rw-r--r--   libbcachefs/sb-counters_format.h     |   3
-rw-r--r--   libbcachefs/super.c                  |  19

26 files changed, 572 insertions(+), 326 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 386580af..abe206fd 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-68b6a68ecf72dba7035a888ad1247fd04d3cc98e
+933c0b52a810e410c1c871dacaaaa0f6a5d67f62
diff --git a/bch_bindgen/src/bkey.rs b/bch_bindgen/src/bkey.rs
index 0c4786eb..1c98b029 100644
--- a/bch_bindgen/src/bkey.rs
+++ b/bch_bindgen/src/bkey.rs
@@ -51,6 +51,7 @@ pub enum BkeyValC<'a> {
     logged_op_finsert(&'a c::bch_logged_op_finsert),
     accounting(&'a c::bch_accounting),
     inode_alloc_cursor(&'a c::bch_inode_alloc_cursor),
+    extent_whiteout,
 }
 
 impl<'a, 'b> BkeySC<'a> {
@@ -109,6 +110,7 @@ impl<'a, 'b> BkeySC<'a> {
             KEY_TYPE_logged_op_finsert => logged_op_finsert(transmute(self.v)),
             KEY_TYPE_accounting => accounting(transmute(self.v)),
             KEY_TYPE_inode_alloc_cursor => inode_alloc_cursor(transmute(self.v)),
+            KEY_TYPE_extent_whiteout => extent_whiteout,
             KEY_TYPE_MAX => unreachable!(),
         }
     }
diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c
index aa825e90..4eca866a 100644
--- a/c_src/cmd_fs.c
+++ b/c_src/cmd_fs.c
@@ -18,96 +18,85 @@
 #include "libbcachefs/darray.h"
 
-static void __dev_usage_type_to_text(struct printbuf *out,
-                    enum bch_data_type type,
-                    unsigned bucket_size,
-                    u64 buckets, u64 sectors, u64 frag)
-{
-    bch2_prt_data_type(out, type);
-    prt_char(out, ':');
-    prt_tab(out);
-
-    prt_units_u64(out, sectors << 9);
-    prt_tab_rjust(out);
-
-    prt_printf(out, "%llu", buckets);
-    prt_tab_rjust(out);
-
-    if (frag) {
-        prt_units_u64(out, frag << 9);
-        prt_tab_rjust(out);
-    }
-    prt_newline(out);
-}
-
-static void dev_usage_type_to_text(struct printbuf *out,
-                   struct bch_ioctl_dev_usage_v2 *u,
-                   enum bch_data_type type)
-{
-    u64 sectors = 0;
-    switch (type) {
-    case BCH_DATA_free:
-    case BCH_DATA_need_discard:
-    case BCH_DATA_need_gc_gens:
-        /* sectors are 0 for these types so calculate sectors for them */
-        sectors = u->d[type].buckets * u->bucket_size;
-        break;
-    default:
-        sectors = u->d[type].sectors;
-    }
-
-    __dev_usage_type_to_text(out, type,
-            u->bucket_size,
-            u->d[type].buckets,
-            sectors,
-            u->d[type].fragmented);
-}
+#define FS_USAGE_FIELDS()   \
+    x(replicas)             \
+    x(btree)                \
+    x(compression)          \
+    x(rebalance_work)       \
+    x(devices)
+
+enum __fs_usage_fields {
+#define x(n)    __FS_USAGE_##n,
+    FS_USAGE_FIELDS()
+#undef x
+};
+
+enum fs_usage_fields {
+#define x(n)    FS_USAGE_##n = BIT(__FS_USAGE_##n),
+    FS_USAGE_FIELDS()
+#undef x
+};
+
+const char * const fs_usage_field_strs[] = {
+#define x(n)    [__FS_USAGE_##n] = #n,
+    FS_USAGE_FIELDS()
+#undef x
+    NULL
+};
 
 static void dev_usage_to_text(struct printbuf *out,
                   struct bchfs_handle fs,
-                  struct dev_name *d)
+                  struct dev_name *d,
+                  bool full)
 {
     struct bch_ioctl_dev_usage_v2 *u = bchu_dev_usage(fs, d->idx);
 
-    prt_newline(out);
-    prt_printf(out, "%s (device %u):", d->label ?: "(no label)", d->idx);
-    prt_tab(out);
-    prt_str(out, d->dev ?: "(device not found)");
-    prt_tab_rjust(out);
-
-    prt_str(out, bch2_member_states[u->state]);
-    prt_tab_rjust(out);
-
-    prt_newline(out);
+    u64 used = 0, capacity = u->nr_buckets * u->bucket_size;
+    for (unsigned type = 0; type < u->nr_data_types; type++) {
+        if (!data_type_is_empty(type))
+            used += u->d[type].sectors;
+    }
 
-    printbuf_indent_add(out, 2);
-    prt_tab(out);
+    prt_printf(out, "%s (device %u):\t%s\r%s\r %02u%%\n",
+           d->label ?: "(no label)", d->idx,
+           d->dev ?: "(device not found)",
+           bch2_member_states[u->state],
+           (unsigned) (used * 100 / capacity));
 
-    prt_str(out, "data");
-    prt_tab_rjust(out);
+    if (full) {
+        printbuf_indent_add(out, 2);
+        prt_printf(out, "\tdata\rbuckets\rfragmented\r\n");
 
-    prt_str(out, "buckets");
-    prt_tab_rjust(out);
+        for (unsigned type = 0; type < u->nr_data_types; type++) {
+            bch2_prt_data_type(out, type);
+            prt_printf(out, ":\t");
 
-    prt_str(out, "fragmented");
-    prt_tab_rjust(out);
+            /* sectors are 0 for empty bucket data types, so calculate sectors for them */
+            u64 sectors = data_type_is_empty(type)
+                ? u->d[type].buckets * u->bucket_size
+                : u->d[type].sectors;
+            prt_units_u64(out, sectors << 9);
 
-    prt_newline(out);
+            prt_printf(out, "\r%llu\r", u->d[type].buckets);
 
-    for (unsigned i = 0; i < u->nr_data_types; i++)
-        dev_usage_type_to_text(out, u, i);
+            u64 fragmented = u->d[type].buckets * u->bucket_size - sectors;
+            if (fragmented)
+                prt_units_u64(out, fragmented << 9);
+            prt_printf(out, "\r\n");
+        }
 
-    prt_str(out, "capacity:");
-    prt_tab(out);
+        prt_printf(out, "capacity:\t");
+        prt_units_u64(out, (u->nr_buckets * u->bucket_size) << 9);
+        prt_printf(out, "\r%llu\r\n", u->nr_buckets);
 
-    prt_units_u64(out, (u->nr_buckets * u->bucket_size) << 9);
-    prt_tab_rjust(out);
-    prt_printf(out, "%llu", u->nr_buckets);
-    prt_tab_rjust(out);
+        prt_printf(out, "bucket size:\t");
+        prt_units_u64(out, u->bucket_size << 9);
+        prt_printf(out, "\r\n");
 
-    printbuf_indent_sub(out, 2);
+        printbuf_indent_sub(out, 2);
+        prt_newline(out);
+    }
 
-    prt_newline(out);
     free(u);
 }
 
@@ -124,19 +113,21 @@ static int dev_by_label_cmp(const void *_l, const void *_r)
 
 static void devs_usage_to_text(struct printbuf *out,
                    struct bchfs_handle fs,
-                   dev_names dev_names)
+                   dev_names dev_names,
+                   bool full)
 {
     sort(dev_names.data, dev_names.nr,
          sizeof(dev_names.data[0]), dev_by_label_cmp, NULL);
 
     printbuf_tabstops_reset(out);
+    prt_newline(out);
     printbuf_tabstop_push(out, 16);
     printbuf_tabstop_push(out, 20);
     printbuf_tabstop_push(out, 16);
     printbuf_tabstop_push(out, 14);
 
     darray_for_each(dev_names, dev)
-        dev_usage_to_text(out, fs, dev);
+        dev_usage_to_text(out, fs, dev, full);
 
     darray_for_each(dev_names, dev) {
         free(dev->dev);
@@ -150,14 +141,9 @@ static void persistent_reserved_to_text(struct printbuf *out,
     if (!sectors)
         return;
 
-    prt_str(out, "reserved:");
-    prt_tab(out);
-    prt_printf(out, "%u/%u ", 1, nr_replicas);
-    prt_tab(out);
-    prt_str(out, "[] ");
+    prt_printf(out, "reserved:\t%u/%u\t[] ", 1, nr_replicas);
     prt_units_u64(out, sectors << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
+    prt_printf(out, "\r\n");
 }
 
 static void replicas_usage_to_text(struct printbuf *out,
@@ -190,21 +176,12 @@ static void replicas_usage_to_text(struct printbuf *out,
     *d++ = '\0';
 
     bch2_prt_data_type(out, r->data_type);
-    prt_char(out, ':');
-    prt_tab(out);
-
-    prt_printf(out, "%u/%u ", r->nr_required, r->nr_devs);
-    prt_tab(out);
-
-    prt_printf(out, "%u ", durability);
-    prt_tab(out);
-
-    prt_printf(out, "%s ", devs);
-    prt_tab(out);
+    prt_printf(out, ":\t%u/%u\t%u\t%s\t",
+           r->nr_required, r->nr_devs,
+           durability, devs);
 
     prt_units_u64(out, sectors << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
+    prt_printf(out, "\r\n");
 }
 
 #define for_each_usage_replica(_u, _r)                  \
@@ -251,15 +228,27 @@ static void accounting_swab_if_old(struct bch_ioctl_query_accounting *in)
 
 static int fs_usage_v1_to_text(struct printbuf *out,
                    struct bchfs_handle fs,
-                   dev_names dev_names)
+                   dev_names dev_names,
+                   enum fs_usage_fields fields)
 {
-    struct bch_ioctl_query_accounting *a =
-        bchu_fs_accounting(fs,
-                   BIT(BCH_DISK_ACCOUNTING_persistent_reserved)|
+    unsigned accounting_types = 0;
+
+    if (fields & FS_USAGE_replicas)
+        accounting_types |= BIT(BCH_DISK_ACCOUNTING_replicas)|
-                   BIT(BCH_DISK_ACCOUNTING_compression)|
-                   BIT(BCH_DISK_ACCOUNTING_btree)|
-                   BIT(BCH_DISK_ACCOUNTING_rebalance_work));
+                    BIT(BCH_DISK_ACCOUNTING_persistent_reserved);
+
+    if (fields & FS_USAGE_compression)
+        accounting_types |= BIT(BCH_DISK_ACCOUNTING_compression);
+
+    if (fields & FS_USAGE_btree)
+        accounting_types |= BIT(BCH_DISK_ACCOUNTING_btree);
+
+    if (fields & FS_USAGE_rebalance_work)
+        accounting_types |= BIT(BCH_DISK_ACCOUNTING_rebalance_work);
+
+    struct bch_ioctl_query_accounting *a =
+        bchu_fs_accounting(fs, accounting_types);
 
     if (!a)
         return -1;
@@ -277,45 +266,27 @@ static int fs_usage_v1_to_text(struct printbuf *out,
     printbuf_tabstop_push(out, 20);
     printbuf_tabstop_push(out, 16);
 
-    prt_str(out, "Size:");
-    prt_tab(out);
+    prt_printf(out, "Size:\t");
     prt_units_u64(out, a->capacity << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
+    prt_printf(out, "\r\n");
 
-    prt_str(out, "Used:");
-    prt_tab(out);
+    prt_printf(out, "Used:\t");
     prt_units_u64(out, a->used << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
+    prt_printf(out, "\r\n");
 
-    prt_str(out, "Online reserved:");
-    prt_tab(out);
+    prt_printf(out, "Online reserved:\t");
     prt_units_u64(out, a->online_reserved << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
-
-    prt_newline(out);
-
-    printbuf_tabstops_reset(out);
-
-    printbuf_tabstop_push(out, 16);
-    prt_str(out, "Data type");
-    prt_tab(out);
-
-    printbuf_tabstop_push(out, 16);
-    prt_str(out, "Required/total");
-    prt_tab(out);
-
-    printbuf_tabstop_push(out, 14);
-    prt_str(out, "Durability");
-    prt_tab(out);
-
-    printbuf_tabstop_push(out, 14);
-    prt_str(out, "Devices");
-    prt_newline(out);
-
-    printbuf_tabstop_push(out, 14);
+    prt_printf(out, "\r\n");
+
+    if (fields & FS_USAGE_replicas) {
+        printbuf_tabstops_reset(out);
+        printbuf_tabstop_push(out, 16);
+        printbuf_tabstop_push(out, 16);
+        printbuf_tabstop_push(out, 14);
+        printbuf_tabstop_push(out, 14);
+        printbuf_tabstop_push(out, 14);
+        prt_printf(out, "\nData type\tRequired/total\tDurability\tDevices\n");
+    }
 
     unsigned prev_type = 0;
 
@@ -364,8 +335,7 @@ static int fs_usage_v1_to_text(struct printbuf *out,
             prt_units_u64(out, nr_extents
                       ? div_u64(sectors_uncompressed << 9, nr_extents)
                       : 0);
-            prt_tab_rjust(out);
-            prt_newline(out);
+            prt_printf(out, "\r\n");
             break;
         case BCH_DISK_ACCOUNTING_btree:
             if (new_type) {
@@ -376,8 +346,7 @@
             }
             prt_printf(out, "%s:\t", bch2_btree_id_str(acc_k.btree.id));
             prt_units_u64(out, a->v.d[0] << 9);
-            prt_tab_rjust(out);
-            prt_newline(out);
+            prt_printf(out, "\r\n");
             break;
         case BCH_DISK_ACCOUNTING_rebalance_work:
             if (new_type)
@@ -395,7 +364,8 @@
 static void fs_usage_v0_to_text(struct printbuf *out,
                 struct bchfs_handle fs,
-                dev_names dev_names)
+                dev_names dev_names,
+                enum fs_usage_fields fields)
 {
     struct bch_ioctl_fs_usage *u = bchu_fs_usage(fs);
 
@@ -410,20 +380,17 @@ static void fs_usage_v0_to_text(struct printbuf *out,
     prt_str(out, "Size:");
     prt_tab(out);
     prt_units_u64(out, u->capacity << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
+    prt_printf(out, "\r\n");
 
     prt_str(out, "Used:");
     prt_tab(out);
     prt_units_u64(out, u->used << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
+    prt_printf(out, "\r\n");
 
     prt_str(out, "Online reserved:");
     prt_tab(out);
     prt_units_u64(out, u->online_reserved << 9);
-    prt_tab_rjust(out);
-    prt_newline(out);
+    prt_printf(out, "\r\n");
 
     prt_newline(out);
 
@@ -473,43 +440,34 @@ static void fs_usage_v0_to_text(struct printbuf *out,
     free(u);
 }
 
-static void fs_usage_to_text(struct printbuf *out, const char *path)
+static void fs_usage_to_text(struct printbuf *out, const char *path,
+                 enum fs_usage_fields fields)
 {
     struct bchfs_handle fs = bcache_fs_open(path);
 
     dev_names dev_names = bchu_fs_get_devices(fs);
 
-    if (!fs_usage_v1_to_text(out, fs, dev_names))
+    if (!fs_usage_v1_to_text(out, fs, dev_names, fields))
         goto devs;
 
-    fs_usage_v0_to_text(out, fs, dev_names);
+    fs_usage_v0_to_text(out, fs, dev_names, fields);
 devs:
-    devs_usage_to_text(out, fs, dev_names);
+    devs_usage_to_text(out, fs, dev_names, fields & FS_USAGE_devices);
 
     darray_exit(&dev_names);
 
     bcache_fs_close(fs);
 }
 
-int fs_usage(void)
-{
-    puts("bcachefs fs - manage a running filesystem\n"
-         "Usage: bcachefs fs <CMD> [OPTIONS]\n"
-         "\n"
-         "Commands:\n"
-         "  usage      Display detailed filesystem usage\n"
-         "  top        Show runtime performance information\n"
-         "\n"
-         "Report bugs to <linux-bcachefs@vger.kernel.org>");
-    return 0;
-}
-
 static void fs_usage_usage(void)
 {
     puts("bcachefs fs usage - display detailed filesystem usage\n"
          "Usage: bcachefs fs usage [OPTION]... <mountpoint>\n"
          "\n"
         "Options:\n"
+         "  -f, --fields=FIELDS       List of accounting sections to print\n"
+         "      replicas,btree,compression,rebalance_work,devices"
+         "  -a                        Print all accounting fields\n"
         "  -h, --human-readable      Human readable units\n"
         "  -H, --help                Display this help and exit\n"
         "Report bugs to <linux-bcachefs@vger.kernel.org>");
@@ -518,18 +476,26 @@ static void fs_usage_usage(void)
 int cmd_fs_usage(int argc, char *argv[])
 {
     static const struct option longopts[] = {
-        { "help",           no_argument,        NULL, 'H' },
+        { "fields",         required_argument,  NULL, 'f' },
+        { "all",            no_argument,        NULL, 'a' },
         { "human-readable", no_argument,        NULL, 'h' },
+        { "help",           no_argument,        NULL, 'H' },
        { NULL }
     };
     bool human_readable = false;
+    unsigned fields = 0;
     struct printbuf buf = PRINTBUF;
     char *fs;
    int opt;
 
-    while ((opt = getopt_long(argc, argv, "h",
+    while ((opt = getopt_long(argc, argv, "f:ahH",
                   longopts, NULL)) != -1)
         switch (opt) {
+        case 'f':
+            fields |= read_flag_list_or_die(optarg, fs_usage_field_strs, "fields");
+            break;
+        case 'a':
+            fields = ~0;
        case 'h':
            human_readable = true;
            break;
@@ -545,13 +511,13 @@ int cmd_fs_usage(int argc, char *argv[])
     if (!argc) {
         printbuf_reset(&buf);
         buf.human_readable_units = human_readable;
-        fs_usage_to_text(&buf, ".");
+        fs_usage_to_text(&buf, ".", fields);
         printf("%s", buf.buf);
     } else {
         while ((fs = arg_pop())) {
             printbuf_reset(&buf);
             buf.human_readable_units = human_readable;
-            fs_usage_to_text(&buf, fs);
+            fs_usage_to_text(&buf, fs, fields);
             printf("%s", buf.buf);
         }
     }
@@ -560,6 +526,19 @@ int cmd_fs_usage(int argc, char *argv[])
     return 0;
 }
 
+int fs_usage(void)
+{
+    puts("bcachefs fs - manage a running filesystem\n"
+         "Usage: bcachefs fs <CMD> [OPTIONS]\n"
+         "\n"
+         "Commands:\n"
+         "  usage      Display detailed filesystem usage\n"
+         "  top        Show runtime performance information\n"
+         "\n"
+         "Report bugs to <linux-bcachefs@vger.kernel.org>");
+    return 0;
+}
+
 int fs_cmds(int argc, char *argv[])
 {
     char *cmd = pop_cmd(&argc, argv);
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index cdf593c5..16d08dfb 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -386,14 +386,6 @@ do {                                \
          ##__VA_ARGS__, bch2_err_str(_ret));        \
 } while (0)
 
-static inline int __bch2_err_trace(struct bch_fs *c, int err)
-{
-    trace_error_throw(c, err, _THIS_IP_);
-    return err;
-}
-
-#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
-
 /* Parameters that are useful for debugging, but should always be compiled in: */
 #define BCH_DEBUG_PARAMS_ALWAYS()                   \
    BCH_DEBUG_PARAM(key_merging_disabled,           \
@@ -1153,6 +1145,15 @@ struct bch_fs {
     struct mutex        fsck_error_counts_lock;
 };
 
+static inline int __bch2_err_trace(struct bch_fs *c, int err)
+{
+    this_cpu_inc(c->counters[BCH_COUNTER_error_throw]);
+    trace_error_throw(c, err, _THIS_IP_);
+    return err;
+}
+
+#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
 extern struct wait_queue_head bch2_read_only_wait;
 
 static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index b4a04df5..a8f59522 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -423,7 +423,8 @@ enum bch_bkey_type_flags {
    x(logged_op_truncate,       32, BKEY_TYPE_strict_btree_checks)   \
    x(logged_op_finsert,        33, BKEY_TYPE_strict_btree_checks)   \
    x(accounting,               34, BKEY_TYPE_strict_btree_checks)   \
-   x(inode_alloc_cursor,       35, BKEY_TYPE_strict_btree_checks)
+   x(inode_alloc_cursor,       35, BKEY_TYPE_strict_btree_checks)   \
+   x(extent_whiteout,          36, BKEY_TYPE_strict_btree_checks)
 
 enum bch_bkey_type {
 #define x(name, nr, ...) KEY_TYPE_##name   = nr,
@@ -440,6 +441,10 @@ struct bch_whiteout {
    struct bch_val      v;
 };
 
+struct bch_extent_whiteout {
+   struct bch_val      v;
+};
+
 struct bch_error {
    struct bch_val      v;
 };
@@ -700,7 +705,8 @@ struct bch_sb_field_ext {
    x(extent_flags,                 BCH_VERSION(1, 25))     \
    x(snapshot_deletion_v2,         BCH_VERSION(1, 26))     \
    x(fast_device_removal,          BCH_VERSION(1, 27))     \
-   x(inode_has_case_insensitive,   BCH_VERSION(1, 28))
+   x(inode_has_case_insensitive,   BCH_VERSION(1, 28))     \
+   x(extent_snapshot_whiteouts,    BCH_VERSION(1, 29))
 
 enum bcachefs_metadata_version {
    bcachefs_metadata_version_min = 9,
@@ -1340,6 +1346,7 @@ enum btree_id_flags {
      BTREE_IS_snapshots|                   \
      BTREE_IS_data,                        \
      BIT_ULL(KEY_TYPE_whiteout)|           \
+     BIT_ULL(KEY_TYPE_extent_whiteout)|    \
      BIT_ULL(KEY_TYPE_error)|              \
      BIT_ULL(KEY_TYPE_cookie)|             \
      BIT_ULL(KEY_TYPE_extent)|             \
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index fcd8c82c..75d73677 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -41,6 +41,10 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
    .key_validate   = deleted_key_validate,        \
 })
 
+#define bch2_bkey_ops_extent_whiteout ((struct bkey_ops) {  \
+   .key_validate   = deleted_key_validate,        \
+})
+
 static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
                  struct bkey_validate_context from)
 {
@@ -203,7 +207,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
              ? bch2_bkey_types[k.k->type]
              : "(unknown)");
 
-   if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+   if (btree_node_type_is_extents(type) && !bkey_extent_whiteout(k.k)) {
        bkey_fsck_err_on(k.k->size == 0,
                 c, bkey_extent_size_zero,
                 "size == 0");
diff --git a/libbcachefs/bkey_types.h b/libbcachefs/bkey_types.h
index b4f328f9..88a48ce6 100644
--- a/libbcachefs/bkey_types.h
+++ b/libbcachefs/bkey_types.h
@@ -44,6 +44,11 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 #define bkey_whiteout(_k)  \
    ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
 
+#define bkey_extent_whiteout(_k)               \
+   ((_k)->type == KEY_TYPE_deleted ||          \
+    (_k)->type == KEY_TYPE_whiteout ||         \
+    (_k)->type == KEY_TYPE_extent_whiteout)
+
 /* bkey with split value, const */
 struct bkey_s_c {
    const struct bkey   *k;
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 9261ad04..3b1d694d 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -804,7 +804,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
        goto got_node;
    }
 
-   b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
+   b = __btree_node_mem_alloc(c, GFP_NOWAIT);
    if (b) {
        bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
    } else {
@@ -842,7 +842,7 @@ got_node:
 
    mutex_unlock(&bc->lock);
 
-   if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+   if (btree_node_data_alloc(c, b, GFP_NOWAIT)) {
        bch2_trans_unlock(trans);
        if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
            goto err;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 8a03cd75..276cf088 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -131,7 +131,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
    BUG_ON(size > c->opts.btree_node_size);
 
    *used_mempool = false;
-   p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+   p = kvmalloc(size, GFP_NOWAIT);
    if (!p) {
        *used_mempool = true;
        p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index a67babf6..8962c481 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -2450,10 +2450,27 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
                continue;
            }
 
-           if (bkey_whiteout(k.k) &&
-               !(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
-               search_key = bkey_successor(iter, k.k->p);
-               continue;
+           if (!(iter->flags & BTREE_ITER_nofilter_whiteouts)) {
+               /*
+                * KEY_TYPE_extent_whiteout indicates that there
+                * are no extents that overlap with this
+                * whiteout - meaning bkey_start_pos() is
+                * monotonically increasing when including
+                * KEY_TYPE_extent_whiteout (not
+                * KEY_TYPE_whiteout).
+                *
+                * Without this @end wouldn't be able to
+                * terminate searches and we'd have to scan
+                * through tons of whiteouts:
+                */
+               if (k.k->type == KEY_TYPE_extent_whiteout &&
+                   bkey_ge(k.k->p, end))
+                   goto end;
+
+               if (bkey_extent_whiteout(k.k)) {
+                   search_key = bkey_successor(iter, k.k->p);
+                   continue;
+               }
            }
        }
 
@@ -2711,7 +2728,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
                saved_path = 0;
            }
 
-           if (!bkey_whiteout(k.k)) {
+           if (!bkey_extent_whiteout(k.k)) {
                saved_path = btree_path_clone(trans, iter->path,
                            iter->flags & BTREE_ITER_intent,
                            _THIS_IP_);
@@ -2724,7 +2741,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
                continue;
            }
 
-           if (bkey_whiteout(k.k)) {
+           if (bkey_extent_whiteout(k.k)) {
                search_key = bkey_predecessor(iter, k.k->p);
                search_key.snapshot = U32_MAX;
                continue;
@@ -2865,7 +2882,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
            iter->k = *k.k;
        }
 
-       if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+       if (unlikely(bkey_extent_whiteout(k.k) &&
                 (iter->flags & BTREE_ITER_filter_snapshots) &&
                 !(iter->flags & BTREE_ITER_nofilter_whiteouts)))
            iter->k.type = KEY_TYPE_deleted;
@@ -2878,31 +2895,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
        EBUG_ON(btree_iter_path(trans, iter)->level);
 
-       if (iter->flags & BTREE_ITER_intent) {
-           struct btree_iter iter2;
+       struct btree_iter iter2;
 
-           bch2_trans_copy_iter(&iter2, iter);
-           k = bch2_btree_iter_peek_max(&iter2, end);
+       bch2_trans_copy_iter(&iter2, iter);
+       iter2.flags |= BTREE_ITER_nofilter_whiteouts;
 
-           if (k.k && !bkey_err(k)) {
-               swap(iter->key_cache_path, iter2.key_cache_path);
-               iter->k = iter2.k;
-               k.k = &iter->k;
+       while (1) {
+           k = bch2_btree_iter_peek_max(&iter2, end);
+           if ((iter2.flags & BTREE_ITER_is_extents) &&
+               k.k &&
+               !bkey_err(k) &&
+               k.k->type == KEY_TYPE_whiteout) {
+               bch2_btree_iter_set_pos(&iter2, k.k->p);
+               continue;
            }
 
-           bch2_trans_iter_exit(&iter2);
-       } else {
-           struct bpos pos = iter->pos;
-           k = bch2_btree_iter_peek_max(iter, end);
-           if (unlikely(bkey_err(k)))
-               bch2_btree_iter_set_pos(iter, pos);
-           else
-               iter->pos = pos;
+           break;
+       }
+
+       if (k.k && !bkey_err(k)) {
+           swap(iter->key_cache_path, iter2.key_cache_path);
+           iter->k = iter2.k;
+           k.k = &iter->k;
        }
+       bch2_trans_iter_exit(&iter2);
 
        if (unlikely(bkey_err(k)))
            goto out;
 
+       if (unlikely(k.k &&
+                bkey_extent_whiteout(k.k) &&
+                (iter->flags & BTREE_ITER_filter_snapshots) &&
+                !(iter->flags & BTREE_ITER_nofilter_whiteouts)))
+           iter->k.type = KEY_TYPE_deleted;
+
        next = k.k ? bkey_start_pos(k.k) : POS_MAX;
 
        if (bkey_lt(iter->pos, next)) {
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index b117cb5d..c8fc6ee0 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -954,7 +954,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 
 #define allocate_dropping_locks_errcode(_trans, _do)            \
 ({                                      \
-   gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;               \
+   gfp_t _gfp = GFP_NOWAIT;                    \
    int _ret = _do;                         \
                                    \
    if (bch2_err_matches(_ret, ENOMEM)) {               \
@@ -966,7 +966,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 
 #define allocate_dropping_locks(_trans, _ret, _do)          \
 ({                                      \
-   gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;               \
+   gfp_t _gfp = GFP_NOWAIT;                    \
    typeof(_do) _p = _do;                       \
                                    \
    _ret = 0;                           \
@@ -979,7 +979,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 
 #define allocate_dropping_locks_norelock(_trans, _lock_dropped, _do)    \
 ({                                      \
-   gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN;               \
+   gfp_t _gfp = GFP_NOWAIT;                    \
    typeof(_do) _p = _do;                       \
    _lock_dropped = false;                      \
    if (unlikely(!_p)) {                        \
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 8b94a815..4d58bdb2 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -449,7 +449,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
        return 0;
 
    new_u64s    = roundup_pow_of_two(u64s);
-   new_k       = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
+   new_k       = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
    if (unlikely(!new_k))
        return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
 
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 6f3b5757..f59f018f 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -12,6 +12,7 @@
 #include "extents.h"
 #include "keylist.h"
 #include "snapshot.h"
+#include "super-io.h"
 #include "trace.h"
 
 #include <linux/string_helpers.h>
@@ -158,6 +159,21 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
    return ret;
 }
 
+static inline enum bch_bkey_type extent_whiteout_type(struct bch_fs *c, enum btree_id btree, const struct bkey *k)
+{
+   /*
+    * KEY_TYPE_extent_whiteout indicates that there isn't a real extent
+    * present at that position: key start positions inclusive of
+    * KEY_TYPE_extent_whiteout (but not KEY_TYPE_whiteout) are
+    * monotonically increasing
+    */
+   return btree_id_is_extents_snapshots(btree) &&
+       bkey_deleted(k) &&
+       !bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_snapshot_whiteouts)
+       ? KEY_TYPE_extent_whiteout
+       : KEY_TYPE_whiteout;
+}
+
 int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
                       struct btree_iter *iter,
                       enum btree_iter_update_trigger_flags flags,
@@ -224,14 +240,14 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
        update->k.p = old.k->p;
        update->k.p.snapshot = new.k->p.snapshot;
 
-       if (new.k->p.snapshot != old.k->p.snapshot) {
-           update->k.type = KEY_TYPE_whiteout;
-       } else if (btree_type_has_snapshots(btree_id)) {
-           ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+       if (btree_type_has_snapshots(btree_id)) {
+           ret = new.k->p.snapshot != old.k->p.snapshot
+               ? 1
+               : need_whiteout_for_snapshot(trans, btree_id, update->k.p);
            if (ret < 0)
                return ret;
            if (ret)
-               update->k.type = KEY_TYPE_whiteout;
+               update->k.type = extent_whiteout_type(trans->c, iter->btree_id, new.k);
        }
 
        ret = bch2_btree_insert_nonextent(trans, btree_id, update,
@@ -265,7 +281,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
    CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k),
                BTREE_ITER_intent|
                BTREE_ITER_with_updates|
-               BTREE_ITER_not_extents);
+               BTREE_ITER_not_extents|
+               BTREE_ITER_nofilter_whiteouts);
    struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
    int ret = bkey_err(k);
    if (ret)
@@ -283,12 +300,40 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
        goto next;
    }
 
-   while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
-       bool done = bkey_lt(insert->k.p, k.k->p);
+   while (true) {
+       BUG_ON(bkey_le(k.k->p, bkey_start_pos(&insert->k)));
 
-       ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
-       if (ret)
-           return ret;
+       /*
+        * When KEY_TYPE_whiteout is included, bkey_start_pos is not
+        * monotonically increasing
+        */
+       if (k.k->type != KEY_TYPE_whiteout && bkey_le(insert->k.p, bkey_start_pos(k.k)))
+           break;
+
+       bool done = k.k->type != KEY_TYPE_whiteout && bkey_lt(insert->k.p, k.k->p);
+
+       if (bkey_extent_whiteout(k.k)) {
+           enum bch_bkey_type whiteout_type = extent_whiteout_type(trans->c, btree_id, &insert->k);
+
+           if (bkey_le(k.k->p, insert->k.p) &&
+               k.k->type != whiteout_type) {
+               struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, k);
+               ret = PTR_ERR_OR_ZERO(update);
+               if (ret)
+                   return ret;
+
+               update->k.p.snapshot = iter.snapshot;
+               update->k.type = whiteout_type;
+
+               ret = bch2_trans_update(trans, &iter, update, 0);
+               if (ret)
+                   return ret;
+           }
+       } else {
+           ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+           if (ret)
+               return ret;
+       }
 
        if (done)
            goto out;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 76897cf1..65ca54c5 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -336,6 +336,20 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
    BUG_ON(b->ob.nr);
 
    mutex_lock(&c->btree_reserve_cache_lock);
+   if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
+       guard(spinlock)(&c->freelist_lock);
+       if (c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)) {
+           if (cl)
+               closure_wait(&c->open_buckets_wait, cl);
+
+           ret = cl
+               ? bch_err_throw(c, bucket_alloc_blocked)
+               : bch_err_throw(c, open_buckets_empty);
+           mutex_unlock(&c->btree_reserve_cache_lock);
+           goto err;
+       }
+   }
+
    if (c->btree_reserve_cache_nr > nr_reserve) {
        for (struct btree_alloc *a = c->btree_reserve_cache;
             a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) {
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 01838a3a..0bd4dd06 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -693,6 +693,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
    if (ret)
        return ret;
 
+   const union bch_extent_entry *entry;
+   struct extent_ptr_decoded p;
+   unsigned i = 0;
+   bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+       if (data_opts->kill_ec_ptrs & BIT(i))
+           bch2_bkey_drop_ec(n, p.ptr.dev);
+       i++;
+   }
+
    while (data_opts->kill_ptrs) {
        unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
 
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
index 5e14d135..fc12aa65 100644
--- a/libbcachefs/data_update.h
+++ b/libbcachefs/data_update.h
@@ -12,6 +12,7 @@ struct moving_context;
 struct data_update_opts {
    unsigned    rewrite_ptrs;
    unsigned    kill_ptrs;
+   unsigned    kill_ec_ptrs;
    u16     target;
    u8      extra_replicas;
    unsigned    btree_insert_flags;
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index b879a586..7ab03987 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -995,6 +995,22 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
    bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
 }
 
+void bch2_bkey_drop_ec(struct bkey_i *k, unsigned dev)
+{
+   struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+   union bch_extent_entry *entry, *ec = NULL;
+
+   bkey_extent_entry_for_each(ptrs, entry) {
+       if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr)
+           ec = entry;
+       else if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_ptr &&
+            entry->ptr.dev == dev) {
+           bch2_bkey_extent_entry_drop(k, ec);
+           return;
+       }
+   }
+}
+
 const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
 {
    struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -1757,3 +1773,4 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
    memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
    return -val_u64s_delta;
 }
+
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 35ee03cd..f6dcb171 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -650,6 +650,7 @@ void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
 
 void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
 void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_ec(struct bkey_i *k, unsigned);
 
 #define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond)            \
 do {                                    \
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index b5e3090f..52722a5e 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -268,7 +268,7 @@ restart:
    rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
        if (inode->ei_inum.inum == inum) {
            ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
-                         GFP_NOWAIT|__GFP_NOWARN);
+                         GFP_NOWAIT);
            if (ret) {
                rcu_read_unlock();
                ret = darray_make_room(&subvols, 1);
@@ -826,14 +826,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 
    bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME);
 
-   if (inode_u.bi_subvol) {
-       /*
-        * Subvolume deletion is asynchronous, but we still want to tell
-        * the VFS that it's been deleted here:
-        */
-       set_nlink(&inode->v, 0);
-   }
-
    if (IS_CASEFOLDED(vdir))
        d_invalidate(dentry);
 err:
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 6ccea092..01c1c637 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1444,7 +1444,7 @@ static int check_key_has_inode(struct btree_trans *trans,
    if (ret)
        return ret;
 
-   if (k.k->type == KEY_TYPE_whiteout)
+   if (bkey_extent_whiteout(k.k))
        return 0;
 
    bool have_inode = i && !i->whiteout;
@@ -1924,7 +1924,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                          &inode->recalculate_sums);
            if (ret)
                goto err;
+       }
 
+       if (!bkey_extent_whiteout(k.k)) {
            /*
             * Check inodes in reverse order, from oldest snapshots to
             * newest, starting from the inode that matches this extent's
diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h
index 8abd0aa2..6f1e0a7b 100644
--- a/libbcachefs/lru.h
+++ b/libbcachefs/lru.h
@@ -24,6 +24,16 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
    return pos;
 }
 
+static inline struct bpos lru_start(u16 lru_id)
+{
+   return lru_pos(lru_id, 0, 0);
+}
+
+static inline struct bpos lru_end(u16 lru_id)
+{
+   return lru_pos(lru_id, U64_MAX, LRU_TIME_MAX);
+}
+
 static inline enum bch_lru_type lru_type(struct bkey_s_c l)
 {
    u16 lru_id = l.k->p.inode >> 48;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 30fe269d..932b62a9 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -344,7 +344,7 @@ int bch2_move_extent(struct moving_context *ctxt,
    if (!data_opts.rewrite_ptrs &&
        !data_opts.extra_replicas &&
        !data_opts.scrub) {
-       if (data_opts.kill_ptrs) {
+       if (data_opts.kill_ptrs|data_opts.kill_ec_ptrs) {
            this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size);
            return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
        } else {
@@ -542,7 +542,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
 
    if (ctxt->wait_on_copygc && c->copygc_running) {
        bch2_moving_ctxt_flush_all(ctxt);
-       wait_event_killable(c->copygc_running_wq,
+       wait_event_freezable(c->copygc_running_wq,
                    !c->copygc_running ||
                    (is_kthread && kthread_should_stop()));
    }
@@ -1280,7 +1280,17 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
        i++;
    }
 
-   return data_opts->kill_ptrs != 0;
+   i = 0;
+   bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+       if (p.has_ec && durability - p.ec.redundancy >= replicas) {
+           data_opts->kill_ec_ptrs |= BIT(i);
+           durability -= p.ec.redundancy;
+       }
+
+       i++;
+   }
+
+   return (data_opts->kill_ptrs|data_opts->kill_ec_ptrs) != 0;
 }
 
 static bool scrub_pred(struct bch_fs *c, void *_arg,
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index b0cbe3c1..f36d60b8 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -14,6 +14,7 @@
 #include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
+#include "ec.h"
 #include "errcode.h"
 #include "error.h"
 #include "lru.h"
@@ -131,72 +132,153 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
    return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
 }
 
+static int try_add_copygc_bucket(struct btree_trans *trans,
+                struct buckets_in_flight *buckets_in_flight,
+                struct bpos bucket, u64 lru_time)
+{
+   struct move_bucket b = { .k.bucket = bucket };
+
+   int ret = bch2_bucket_is_movable(trans, &b, lru_time);
+   if (ret <= 0)
+       return ret;
+
+   if (bucket_in_flight(buckets_in_flight, b.k))
+       return 0;
+
+   struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
+   if (!b_i)
+       return -ENOMEM;
+
+   *b_i = b;
+
+   ret = darray_push(&buckets_in_flight->to_evacuate, b_i);
+   if (ret) {
+       kfree(b_i);
+       return ret;
+   }
+
+   ret = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
+                       bch_move_bucket_params);
+   BUG_ON(ret);
+
+   size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+   return buckets_in_flight->to_evacuate.nr >= nr_to_get;
+}
+
 static int bch2_copygc_get_buckets(struct moving_context *ctxt,
            struct buckets_in_flight *buckets_in_flight)
 {
    struct btree_trans *trans = ctxt->trans;
-   struct bch_fs *c = trans->c;
-   size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
-   size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
-   int ret;
 
-   move_buckets_wait(ctxt, buckets_in_flight, false);
+   int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+                 lru_start(BCH_LRU_BUCKET_FRAGMENTATION),
+                 lru_end(BCH_LRU_BUCKET_FRAGMENTATION),
+                 0, k,
+       try_add_copygc_bucket(trans, buckets_in_flight,
+                     u64_to_bucket(k.k->p.offset),
+                     lru_pos_time(k.k->p))
+   );
 
-   ret = bch2_btree_write_buffer_tryflush(trans);
-   if (bch2_err_matches(ret, EROFS))
-       return ret;
+   return ret < 0 ? ret : 0;
+}
 
-   if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
-       return ret;
+static int bch2_copygc_get_stripe_buckets(struct moving_context *ctxt,
+           struct buckets_in_flight *buckets_in_flight)
+{
+   struct btree_trans *trans = ctxt->trans;
 
-   ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
-                 lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
-                 lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
-                 0, k, ({
-       struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
-       int ret2 = 0;
+   int ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+                 lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+                 lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+                 0, lru_k, ({
+       CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+       struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+       int ret2 = bkey_err(s_k);
+       if (ret2)
+           goto err;
 
-       saw++;
+       if (s_k.k->type != KEY_TYPE_stripe)
+           continue;
 
-       ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
-       if (ret2 < 0)
-           goto err;
+       const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
 
-       if (!ret2)
-           not_movable++;
-       else if (bucket_in_flight(buckets_in_flight, b.k))
-           in_flight++;
-       else {
-           struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
-           ret2 = b_i ? 0 : -ENOMEM;
+       /* write buffer race? */
+       if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+           continue;
+
+       unsigned nr_data = s->nr_blocks - s->nr_redundant;
+       for (unsigned i = 0; i < nr_data; i++) {
+           if (!stripe_blockcount_get(s, i))
+               continue;
+
+           const struct bch_extent_ptr *ptr = s->ptrs + i;
+           CLASS(bch2_dev_tryget, ca)(trans->c, ptr->dev);
+           if (unlikely(!ca))
+               continue;
+
+           ret2 = try_add_copygc_bucket(trans, buckets_in_flight,
+                            PTR_BUCKET_POS(ca, ptr), U64_MAX);
            if (ret2)
-               goto err;
+               break;
+       }
+err:
+       ret2;
    })); 
 
-           *b_i = b;
+   return ret < 0 ? ret : 0;
+}
+
+static bool should_do_ec_copygc(struct btree_trans *trans)
+{
+   u64 stripe_frag_ratio = 0;
+
+   for_each_btree_key_max(trans, iter, BTREE_ID_lru,
+                 lru_start(BCH_LRU_STRIPE_FRAGMENTATION),
+                 lru_end(BCH_LRU_STRIPE_FRAGMENTATION),
+                 0, lru_k, ({
+       CLASS(btree_iter, s_iter)(trans, BTREE_ID_stripes, POS(0, lru_k.k->p.offset), 0);
+       struct bkey_s_c s_k = bch2_btree_iter_peek_slot(&s_iter);
+       int ret = bkey_err(s_k);
+       if (ret)
+           goto err;
 
-           ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
-           if (ret2) {
-               kfree(b_i);
-               goto err;
-           }
+       if (s_k.k->type != KEY_TYPE_stripe)
+           continue;
 
-           ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
-                            bch_move_bucket_params);
-           BUG_ON(ret2);
+       const struct bch_stripe *s = bkey_s_c_to_stripe(s_k).v;
 
-           sectors += b.sectors;
-       }
+       /* write buffer race? */
+       if (stripe_lru_pos(s) != lru_pos_time(lru_k.k->p))
+           continue;
 
-       ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
+       unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_nonempty = 0;
+       for (unsigned i = 0; i < nr_data; i++)
+           blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+       /* stripe is pending delete */
+       if (!blocks_nonempty)
+           continue;
+
+       /* This matches the calculation in alloc_lru_idx_fragmentation, so we can
+        * directly compare without actually looking up the bucket pointed to by the
+        * bucket fragmentation lru:
+        */
+       stripe_frag_ratio = div_u64(blocks_nonempty * (1ULL << 31), nr_data);
+       break;
 err:
-       ret2;
+       ret;
    }));
 
-   pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
-        buckets_in_flight->nr, buckets_in_flight->sectors,
-        saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
+   CLASS(btree_iter, iter)(trans, BTREE_ID_lru, lru_start(BCH_LRU_BUCKET_FRAGMENTATION), 0);
+   struct bkey_s_c lru_k;
 
-   return ret < 0 ? ret : 0;
+   lockrestart_do(trans, bkey_err(lru_k = bch2_btree_iter_peek_max(&iter,
+                       lru_end(BCH_LRU_BUCKET_FRAGMENTATION))));
+
+   u64 bucket_frag_ratio = lru_k.k && !bkey_err(lru_k) ? lru_pos_time(lru_k.k->p) : 0;
+
+   /* Prefer normal bucket copygc */
+   return stripe_frag_ratio && stripe_frag_ratio * 2 < bucket_frag_ratio;
 }
 
 noinline
@@ -213,7 +295,18 @@ static int bch2_copygc(struct moving_context *ctxt,
    u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
    int ret = 0;
 
-   ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
+   move_buckets_wait(ctxt, buckets_in_flight, false);
+
+   ret = bch2_btree_write_buffer_tryflush(trans);
+   if (bch2_err_matches(ret, EROFS))
+       goto err;
+
+   if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
+       goto err;
+
+   ret = should_do_ec_copygc(trans)
+       ? bch2_copygc_get_stripe_buckets(ctxt, buckets_in_flight)
+       : bch2_copygc_get_buckets(ctxt, buckets_in_flight);
    if (ret)
        goto err;
 
@@ -265,7 +358,8 @@ static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
 
    for (unsigned i = 0; i < BCH_DATA_NR; i++)
        if (data_type_movable(i))
-           fragmented += usage_full.d[i].fragmented;
+           fragmented += usage_full.d[i].buckets * ca->mi.bucket_size -
+               usage_full.d[i].sectors;
 
    return max(0LL, fragmented_allowed - fragmented);
 }
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index c0c5fe96..17ca56b0 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -292,12 +292,48 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
        : 0;
 }
 
-static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
-                       struct btree_iter *work_iter)
+#define REBALANCE_WORK_BUF_NR      1024
+DEFINE_DARRAY_NAMED(darray_rebalance_work, struct bkey_i_cookie);
+
+static struct bkey_i *next_rebalance_entry(struct btree_trans *trans,
+                      darray_rebalance_work *buf, struct bpos *work_pos)
 {
-   return !kthread_should_stop()
-       ? bch2_btree_iter_peek(work_iter)
-       : bkey_s_c_null;
+   if (unlikely(!buf->nr)) {
+       /*
+        * Avoid contention with write buffer flush: buffer up rebalance
+        * work entries in a darray
+        */
+
+       BUG_ON(!buf->size);;
+
+       bch2_trans_begin(trans);
+
+       for_each_btree_key(trans, iter, BTREE_ID_rebalance_work, *work_pos,
+                  BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
+           /* we previously used darray_make_room */
+           BUG_ON(bkey_bytes(k.k) > sizeof(buf->data[0]));
+
+           bkey_reassemble(&darray_top(*buf).k_i, k);
+           buf->nr++;
+
+           *work_pos = bpos_successor(iter.pos);
+           if (buf->nr == buf->size)
+               break;
+           0;
+       }));
+
+       if (!buf->nr)
+           return NULL;
+
+       unsigned l = 0, r = buf->nr - 1;
+       while (l < r) {
+           swap(buf->data[l], buf->data[r]);
+           l++;
+           --r;
+       }
+   }
+
+   return &(&darray_pop(buf))->k_i;
 }
 
 static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
@@ -464,10 +500,9 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
    per_snapshot_io_opts_init(&snapshot_io_opts, c);
 
    int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
-                 r->scan_start.pos, r->scan_end.pos,
-                 BTREE_ITER_all_snapshots|
-                 BTREE_ITER_not_extents|
-                 BTREE_ITER_prefetch, k, ({
+                 r->scan_start.pos, r->scan_end.pos,
+                 BTREE_ITER_all_snapshots|
+                 BTREE_ITER_prefetch, k, ({
        ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
 
        struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
@@ -524,49 +559,37 @@ static int do_rebalance(struct moving_context *ctxt)
    struct btree_trans *trans = ctxt->trans;
    struct bch_fs *c = trans->c;
    struct bch_fs_rebalance *r = &c->rebalance;
-   struct btree_iter extent_iter = { NULL };
-   struct bkey_s_c k;
+   struct btree_iter extent_iter = {};
    u32 kick = r->kick;
-   int ret = 0;
 
-   bch2_trans_begin(trans);
+   struct bpos work_pos = POS_MIN;
+   CLASS(darray_rebalance_work, work)();
+   int ret = darray_make_room(&work, REBALANCE_WORK_BUF_NR);
+   if (ret)
+       return ret;
 
    bch2_move_stats_init(&r->work_stats, "rebalance_work");
    bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
 
-   CLASS(btree_iter, rebalance_work_iter)(trans,
-                  BTREE_ID_rebalance_work, POS_MIN,
-                  BTREE_ITER_all_snapshots);
-
    while (!bch2_move_ratelimit(ctxt)) {
        if (!bch2_rebalance_enabled(c)) {
            bch2_moving_ctxt_flush_all(ctxt);
            kthread_wait_freezable(bch2_rebalance_enabled(c) ||
                           kthread_should_stop());
+           if (kthread_should_stop())
+               break;
        }
 
-       if (kthread_should_stop())
+       struct bkey_i *k = next_rebalance_entry(trans, &work, &work_pos);
+       if (!k)
            break;
 
-       bch2_trans_begin(trans);
-
-       ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-           continue;
-       if (ret || !k.k)
-           break;
-
-       ret = k.k->type == KEY_TYPE_cookie
-           ? do_rebalance_scan(ctxt, k.k->p.inode,
-                       le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
-           : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-           continue;
+       ret = k->k.type == KEY_TYPE_cookie
+           ? do_rebalance_scan(ctxt, k->k.p.inode,
+                       le64_to_cpu(bkey_i_to_cookie(k)->v.cookie))
+           : lockrestart_do(trans, do_rebalance_extent(ctxt, k->k.p, &extent_iter));
        if (ret)
            break;
-
-       bch2_btree_iter_advance(&rebalance_work_iter);
    }
 
    bch2_trans_iter_exit(&extent_iter);
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
index f3ea53a5..740859c7 100644
--- a/libbcachefs/sb-counters_format.h
+++ b/libbcachefs/sb-counters_format.h
@@ -101,7 +101,8 @@ enum counters_flags {
    x(trans_restart_write_buffer_flush,        75, TYPE_COUNTER)   \
    x(trans_restart_split_race,            76, TYPE_COUNTER)   \
    x(write_buffer_flush_slowpath,          77, TYPE_COUNTER)   \
-   x(write_buffer_flush_sync,          78, TYPE_COUNTER)
+   x(write_buffer_flush_sync,          78, TYPE_COUNTER)   \
+   x(error_throw,                  93, TYPE_COUNTER)
 
 enum bch_persistent_counters {
 #define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 6c72d93b..ef15e614 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1054,16 +1054,19 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
 
    bch2_print_str(c, KERN_INFO, p.buf);
 
-   if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
-       bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
-       return -EINVAL;
-   }
+   if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+           bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+           return -EINVAL;
+       }
 
-   if (!c->sb.clean &&
-       !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
-       bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
-       return -EINVAL;
+       if (!c->sb.clean &&
+           !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+           bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+           return -EINVAL;
+       }
    }
+
    return 0;
 }